In [ ]:
# ============================
#  Title:  Multi-Classifier Modeling, Hyperparameter Tuning & Evaluation
#  Author: Siyang Ni
#  Date:   [Date]
#  Notes:  This script showcases a comprehensive pipeline for loading data,
#          preprocessing, model training, hyperparameter tuning, and evaluation
#          across multiple algorithms: RandomForest, GradientBoosting,
#          HistGradientBoosting, XGBoost, and CatBoost. Includes interpretability
#          with SHAP, partial dependence plots, and feature importances.
# ============================

Setting Up¶

In [ ]:
# ================
# 1. IMPORTS
# ================

# !pip install --upgrade pandas numpy matplotlib seaborn joblib scikit-learn xgboost catboost shap optuna
# pip install --upgrade ipywidgets

import os
import logging
import warnings
import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys

from sklearn.model_selection import (
    train_test_split, GridSearchCV, StratifiedKFold, cross_val_score, RandomizedSearchCV, RepeatedStratifiedKFold
)
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (
    confusion_matrix, classification_report, roc_auc_score, roc_curve
)
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier, 
    HistGradientBoostingClassifier 
)
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# Interpretability

from sklearn.inspection import permutation_importance, PartialDependenceDisplay
import shap

# Optimization
import optuna
In [ ]:
# ================
# 2. CONFIGURATION
# ================
RANDOM_STATE = 42
TEST_SIZE = 0.2
N_SPLITS_CV = 5
SCORING_METRIC = 'roc_auc'
VERBOSE = 1

CPU_COUNT = os.cpu_count()

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
In [ ]:
# ================
# 3. HELPER FUNCTIONS
# ================

def load_data(filepath: str) -> pd.DataFrame:
    """
    Loads data from a CSV file into a Pandas DataFrame.
    
    Parameters
    ----------
    filepath : str
        Full path to the CSV file.
    
    Returns
    -------
    pd.DataFrame or None
        Loaded DataFrame if successful, None if file not found.
    """
    try:
        df = pd.read_csv(os.path.expanduser(filepath))
        logging.info("Data loaded successfully.")
        return df
    except FileNotFoundError:
        logging.error(f"File not found at {filepath}")
        return None


def identify_categorical_columns(df: pd.DataFrame) -> list:
    """
    Identifies columns of type object or category in a DataFrame.
    """
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
    logging.info(f"Identified categorical columns: {categorical_cols}")
    return categorical_cols


def convert_to_categorical(df: pd.DataFrame, columns: list) -> None:
    """
    Converts specified columns in a DataFrame to categorical type in-place.
    """
    for col in columns:
        if col in df.columns:
            df[col] = df[col].astype('category')
        else:
            logging.warning(f"Column '{col}' not found in DataFrame.")
    logging.info("Categorical conversion complete.")


def create_train_test_split(
    X: pd.DataFrame, 
    y: pd.Series, 
    test_size: float = TEST_SIZE, 
    random_state: int = RANDOM_STATE
) -> tuple:
    """
    Splits data into training and testing sets.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, 
        stratify=y, shuffle=True
    )
    logging.info(f"Training set shape: {X_train.shape}")
    logging.info(f"Testing set shape: {X_test.shape}")
    return X_train, X_test, y_train, y_test


def create_missing_indicators(
    X_train: pd.DataFrame, 
    X_test: pd.DataFrame
) -> tuple:
    """
    Creates binary indicators for missing values in features.
    """
    missing_indicator = MissingIndicator(features='all')
    missing_indicator.fit(X_train)
    X_train_flags = missing_indicator.transform(X_train)
    X_test_flags = missing_indicator.transform(X_test)
    
    missing_columns = [f'missing_{col}' for col in X_train.columns]
    X_train_with_indicators = pd.concat(
        [X_train.reset_index(drop=True),
         pd.DataFrame(X_train_flags, columns=missing_columns)],
        axis=1
    )
    X_test_with_indicators = pd.concat(
        [X_test.reset_index(drop=True),
         pd.DataFrame(X_test_flags, columns=missing_columns)],
        axis=1
    )
    logging.info("Missing indicators created.")
    return X_train_with_indicators, X_test_with_indicators


def create_preprocessor(categorical_features: list) -> ColumnTransformer:
    """
    Creates a preprocessor for categorical features using OneHotEncoder
    while passing other columns through without transformation.
    """
    preprocessor = ColumnTransformer(
        transformers=[
            (
                'cat', 
                OneHotEncoder(
                    drop='first', 
                    handle_unknown='ignore'
                ),
                categorical_features
            )
        ],
        remainder='passthrough'
    )
    return preprocessor


def train_evaluate_model(
    model, 
    X_train: pd.DataFrame, 
    y_train: pd.Series, 
    X_test: pd.DataFrame, 
    y_test: pd.Series, 
    model_name: str = "Model", 
    save_path: str = None
):
    """
    Trains, evaluates, and optionally saves a model. 
    Prints confusion matrix, classification report, and ROC AUC.
    Plots the ROC curve.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    
    logging.info(f"=== {model_name} Evaluation ===")
    logging.info("Confusion Matrix:\n" + str(confusion_matrix(y_test, y_pred)))
    logging.info("\nClassification Report:\n" + str(classification_report(y_test, y_pred)))
    logging.info(f"ROC AUC: {roc_auc_score(y_test, y_pred_proba):.4f}")
    
    # Plot ROC Curve
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'AUC = {roc_auc_score(y_test, y_pred_proba):.4f}')
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'{model_name} ROC Curve on Test Data')
    plt.legend(loc='lower right')
    plt.show()
    
    if save_path:
        joblib.dump(model, save_path)
        logging.info(f"{model_name} saved to '{save_path}'.")
    
    return model


def perform_grid_search(
    pipeline: Pipeline, 
    param_grid: dict, 
    X_train: pd.DataFrame, 
    y_train: pd.Series, 
    cv=None, 
    scoring: str = 'roc_auc', 
    n_jobs: int = -1, 
    verbose: int = 1
):
    """
    Performs GridSearchCV for hyperparameter tuning on a pipeline.
    """
    if cv is None:
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    grid_search = GridSearchCV(
        estimator=pipeline, 
        param_grid=param_grid, 
        cv=cv, 
        scoring=scoring, 
        n_jobs=n_jobs, 
        verbose=verbose
    )
    grid_search.fit(X_train, y_train)
    logging.info("Best parameters found: " + str(grid_search.best_params_))
    logging.info(f"Best cross-validation {scoring}: {grid_search.best_score_:.4f}")
    return grid_search.best_estimator_


def plot_feature_importance(
    model, 
    feature_names: list, 
    top_n: int = 20, 
    title: str = "Feature Importance"
):
    """
    Plots the top N feature importances from a trained model.
    """
    if hasattr(model, 'feature_importances_'): 
        importances = model.feature_importances_
    elif hasattr(model, 'named_steps') and 'classifier' in model.named_steps:
        if hasattr(model.named_steps['classifier'], 'feature_importances_'):
            importances = model.named_steps['classifier'].feature_importances_
        else:
            raise ValueError("Classifier does not have feature_importances_ attribute.")
    else:
        raise ValueError("Provided model does not have feature_importances_ attribute.")
    
    fi_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    fi_df = fi_df.sort_values('Importance', ascending=False).head(top_n)
    
    plt.figure(figsize=(10, 6))
    sns.barplot(x='Importance', y='Feature', data=fi_df)
    plt.title(title)
    plt.tight_layout()
    plt.show()


def aggregate_feature_importance(importances: np.ndarray, encoded_feature_names: list) -> pd.DataFrame:
    """
    Aggregates feature importance of one-hot-encoded features back to original feature names.
    """
    original_features = list(set([feat.split('_')[0] for feat in encoded_feature_names]))
    original_feature_importance = {feature: 0 for feature in original_features}
    for i, encoded_feature in enumerate(encoded_feature_names):
        base_feature = encoded_feature.split('_')[0]
        original_feature_importance[base_feature] += importances[i]
    
    importance_df = pd.DataFrame(
        list(original_feature_importance.items()), 
        columns=['Feature', 'Importance']
    )
    importance_df = importance_df.sort_values('Importance', ascending=False)
    return importance_df


def plot_aggregated_feature_importance(
    importance_df: pd.DataFrame, 
    top_n: int = 20, 
    title: str = "Aggregated Feature Importance"
):
    """
    Plots aggregated feature importances after grouping by base feature.
    """
    top_n_df = importance_df.head(top_n).sort_values(by='Importance', ascending=True)
    plt.figure(figsize=(10, 6))
    plt.barh(y=top_n_df['Feature'], width=top_n_df['Importance'])
    plt.xlabel('Importance')
    plt.ylabel('Feature')
    plt.title(title)
    plt.tight_layout()
    plt.show()

Preprocessing¶

In [ ]:
# --- Data Loading ---
data_filepath = os.path.expanduser('~/work/vaping_project_data/processed_data_g12nn.csv')
new_data = load_data(data_filepath)
if new_data is None:
    logging.error("Data loading failed. Exiting script.")
    raise SystemExit

logging.info("Dataset Info:")
new_data.info()
2025-02-15 11:02:48,145 - INFO - Data loaded successfully.
2025-02-15 11:02:48,146 - INFO - Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32730 entries, 0 to 32729
Data columns (total 51 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   V2178           31840 non-null  float64
 1   V2188           31432 non-null  float64
 2   V2197           31034 non-null  float64
 3   V2184           30687 non-null  float64
 4   V2186           31432 non-null  float64
 5   V2171           32096 non-null  float64
 6   V2128           32426 non-null  float64
 7   V2201           31005 non-null  float64
 8   V2173           31855 non-null  float64
 9   V2194           31412 non-null  float64
 10  V2166           31888 non-null  float64
 11  wave            32730 non-null  int64  
 12  V2176           30613 non-null  float64
 13  V2175           31149 non-null  float64
 14  V2177           31046 non-null  float64
 15  nicotine12d     32730 non-null  float64
 16  V2116           32035 non-null  float64
 17  V2125           32302 non-null  float64
 18  V2182           30414 non-null  float64
 19  sex             32730 non-null  float64
 20  race            32730 non-null  float64
 21  V2460           31446 non-null  float64
 22  RESPONDENT_AGE  32665 non-null  float64
 23  V2185           31432 non-null  float64
 24  V2193           29539 non-null  float64
 25  V2163           32328 non-null  float64
 26  V49             32516 non-null  float64
 27  V2108           31410 non-null  float64
 28  V2101           32452 non-null  float64
 29  V2180           30589 non-null  float64
 30  V2164           32315 non-null  float64
 31  V2191           31502 non-null  float64
 32  V2195           31287 non-null  float64
 33  V2155           32448 non-null  float64
 34  V2196           31293 non-null  float64
 35  V2189           31432 non-null  float64
 36  V2179           31819 non-null  float64
 37  V13             32730 non-null  int64  
 38  V2143           32494 non-null  float64
 39  V2134           32450 non-null  float64
 40  V2172           31932 non-null  float64
 41  V2137           32397 non-null  float64
 42  V2140           32545 non-null  float64
 43  V2105           31702 non-null  float64
 44  V2157           32448 non-null  float64
 45  V2183           31191 non-null  float64
 46  V2187           31432 non-null  float64
 47  V2181           30447 non-null  float64
 48  V2152           32636 non-null  float64
 49  V2153           32451 non-null  float64
 50  V2156           32448 non-null  float64
dtypes: float64(49), int64(2)
memory usage: 12.7 MB
In [ ]:
# -----------------------------------------------------------------------------
# Missing Data Analysis
# -----------------------------------------------------------------------------

total_missing = new_data.isna().sum().sum()
print("\nTotal missing values:", total_missing)

# Count negative values in numeric columns.
numeric_cols = new_data.select_dtypes(include=[np.number]).columns
negative_counts = new_data[numeric_cols].apply(lambda x: (x < 0).sum())
negative_counts_df = pd.DataFrame({
    'Column': negative_counts.index,
    'Negative_Count': negative_counts.values
})
print("\nNegative value counts by numeric column:")
print(negative_counts_df)

# Replace negative codes (-9, -8) with NaN.
missing_codes = [-9, -8]
new_data[numeric_cols] = new_data[numeric_cols].replace({-9: np.nan, -8: np.nan})

# Compute missing counts and percentages.
missing_counts = new_data.isna().sum()
missing_percent = (new_data.isna().mean() * 100).round(2)
missing_summary = pd.DataFrame({
    'Missing_Count': missing_counts,
    'Missing_Percentage': missing_percent
}).sort_values(by='Missing_Percentage', ascending=False)
print("\nMissing values summary:")
print(missing_summary.to_string())
Total missing values: 47868

Negative value counts by numeric column:
            Column  Negative_Count
0            V2178               0
1            V2188               0
2            V2197               0
3            V2184               0
4            V2186               0
5            V2171               0
6            V2128               0
7            V2201               0
8            V2173               0
9            V2194               0
10           V2166               0
11            wave               0
12           V2176               0
13           V2175               0
14           V2177               0
15     nicotine12d               0
16           V2116               0
17           V2125               0
18           V2182               0
19             sex               0
20            race               0
21           V2460               0
22  RESPONDENT_AGE               0
23           V2185               0
24           V2193               0
25           V2163               0
26             V49               0
27           V2108               0
28           V2101               0
29           V2180               0
30           V2164               0
31           V2191               0
32           V2195               0
33           V2155               0
34           V2196               0
35           V2189               0
36           V2179               0
37             V13               0
38           V2143               0
39           V2134               0
40           V2172               0
41           V2137               0
42           V2140               0
43           V2105               0
44           V2157               0
45           V2183               0
46           V2187               0
47           V2181               0
48           V2152               0
49           V2153               0
50           V2156               0

Missing values summary:
                Missing_Count  Missing_Percentage
V2193                    3191                9.75
V2182                    2316                7.08
V2181                    2283                6.98
V2180                    2141                6.54
V2176                    2117                6.47
V2184                    2043                6.24
V2201                    1725                5.27
V2197                    1696                5.18
V2177                    1684                5.15
V2175                    1581                4.83
V2183                    1539                4.70
V2195                    1443                4.41
V2196                    1437                4.39
V2108                    1320                4.03
V2194                    1318                4.03
V2189                    1298                3.97
V2188                    1298                3.97
V2185                    1298                3.97
V2187                    1298                3.97
V2186                    1298                3.97
V2460                    1284                3.92
V2191                    1228                3.75
V2105                    1028                3.14
V2179                     911                2.78
V2178                     890                2.72
V2173                     875                2.67
V2166                     842                2.57
V2172                     798                2.44
V2116                     695                2.12
V2171                     634                1.94
V2125                     428                1.31
V2164                     415                1.27
V2163                     402                1.23
V2137                     333                1.02
V2128                     304                0.93
V2155                     282                0.86
V2134                     280                0.86
V2157                     282                0.86
V2156                     282                0.86
V2101                     278                0.85
V2153                     279                0.85
V2143                     236                0.72
V49                       214                0.65
V2140                     185                0.57
V2152                      94                0.29
RESPONDENT_AGE             65                0.20
race                        0                0.00
V13                         0                0.00
sex                         0                0.00
nicotine12d                 0                0.00
wave                        0                0.00
In [ ]:
# Correlation Analysis
# Select numeric variables (excluding the target variable 'nicotine12d' if desired).
cor_vars = new_data.drop(columns=['nicotine12d'], errors='ignore').select_dtypes(include=[np.number])

# Compute the Spearman correlation matrix.
cor_matrix_spearman = cor_vars.corr(method='spearman')

# Check for non-finite values in the correlation matrix.
if not np.all(np.isfinite(cor_matrix_spearman)):
    print("\nWarning: Non-finite values detected in the correlation matrix.")
    # Replace NaN or infinite values with 0 (or another appropriate value).
    cor_matrix_spearman = cor_matrix_spearman.fillna(0)
    cor_matrix_spearman = cor_matrix_spearman.replace([np.inf, -np.inf], 0)

print("\nSpearman Correlation Matrix:")
print(cor_matrix_spearman)

# Create an enhanced heatmap with clustering.
clustergrid = sns.clustermap(cor_matrix_spearman, cmap="coolwarm", figsize=(12, 12))
clustergrid.ax_heatmap.set_title("Enhanced Spearman Correlation Heatmap")
plt.show()

# Identify highly correlated pairs (absolute correlation > 0.5 and less than 1).
high_corr_pairs = []
cols = cor_matrix_spearman.columns
for i in range(len(cols)):
    for j in range(i+1, len(cols)):
        corr_value = cor_matrix_spearman.iloc[i, j]
        if 0.5 < abs(corr_value) < 1:
            high_corr_pairs.append({
                "Variable1": cols[i],
                "Variable2": cols[j],
                "Correlation": corr_value
            })

high_corr_df = pd.DataFrame(high_corr_pairs)
print("\nHighly correlated variable pairs (|corr| > 0.5):")
print(high_corr_df)
Spearman Correlation Matrix:
                   V2178     V2188     V2197     V2184     V2186     V2171  \
V2178           1.000000 -0.049629  0.093042 -0.015743  0.008092  0.046024   
V2188          -0.049629  1.000000 -0.042752  0.321003 -0.026227 -0.102099   
V2197           0.093042 -0.042752  1.000000 -0.035598  0.033113  0.020207   
V2184          -0.015743  0.321003 -0.035598  1.000000 -0.065073 -0.059107   
V2186           0.008092 -0.026227  0.033113 -0.065073  1.000000  0.022292   
V2171           0.046024 -0.102099  0.020207 -0.059107  0.022292  1.000000   
V2128           0.089366 -0.008868  0.071150 -0.007436  0.017063  0.014454   
V2201           0.058376  0.004788  0.216064  0.014461  0.019671 -0.000423   
V2173          -0.093280  0.228164 -0.017428  0.280610 -0.028203 -0.078977   
V2194           0.104285  0.012967  0.136308  0.021631  0.001037 -0.024662   
V2166           0.024141 -0.060672 -0.072844 -0.039552 -0.060795  0.041513   
wave            0.008739 -0.057778 -0.006440 -0.082878 -0.043683  0.011258   
V2176           0.414728 -0.035326  0.094930 -0.025964  0.002425  0.009332   
V2175           0.097227 -0.023300  0.032360  0.004988  0.011282  0.010744   
V2177           0.101125  0.010241  0.035401  0.025626  0.010682 -0.004172   
V2116           0.213772 -0.020080  0.143596 -0.033305  0.018758  0.012502   
V2125           0.091962 -0.028509  0.074941 -0.015441  0.021268  0.039509   
V2182           0.044384 -0.039058 -0.019987 -0.028834  0.044192 -0.000364   
sex             0.003834  0.144495 -0.063447  0.208078 -0.123320 -0.055563   
race            0.061858 -0.002563 -0.049980  0.016777 -0.002884  0.022663   
V2460           0.072343 -0.037442  0.064067 -0.011501  0.020640  0.059038   
RESPONDENT_AGE  0.006229 -0.050273  0.052265 -0.043983  0.020336  0.022078   
V2185           0.017683 -0.184175  0.026835 -0.164759  0.133994  0.027523   
V2193           0.084958  0.036811  0.050691  0.066786  0.010752 -0.008691   
V2163          -0.040773  0.055863 -0.007004  0.095256 -0.018676  0.002841   
V49             0.026545 -0.051102  0.005809 -0.057999  0.046493  0.012928   
V2108           0.138442 -0.010992  0.150654  0.008852  0.025111 -0.000746   
V2101           0.136726 -0.093390  0.134853 -0.078096  0.064537  0.034624   
V2180           0.032746 -0.202660  0.012872 -0.109447  0.086755  0.025446   
V2164          -0.042054  0.041992  0.020642  0.116081 -0.014283  0.002094   
V2191           0.088517 -0.061298  0.145793 -0.062489  0.066350 -0.007981   
V2195           0.064231 -0.009012  0.140739  0.013028  0.012680 -0.004597   
V2155          -0.043915  0.086994 -0.019294  0.039762 -0.010458 -0.064640   
V2196           0.046534  0.020896  0.275856  0.021864  0.030154 -0.049781   
V2189          -0.028700  0.382564 -0.027480  0.583240 -0.025607 -0.067127   
V2179          -0.163579  0.220732 -0.040714  0.257089 -0.070399 -0.081527   
V13             0.118681 -0.020140  0.017733 -0.015608  0.017816  0.033269   
V2143           0.077951 -0.008014  0.059433 -0.009263  0.025706  0.016391   
V2134           0.064991 -0.007664  0.042174  0.002483  0.011987  0.012277   
V2172           0.058068 -0.254797 -0.003818 -0.258314  0.044257  0.091775   
V2137           0.079653 -0.007028  0.056565 -0.001482  0.012795  0.025179   
V2140           0.026645 -0.024031  0.031365  0.005124  0.000927  0.038686   
V2105           0.172417  0.028762  0.175986  0.042125  0.032539 -0.015343   
V2157          -0.013618  0.077304 -0.037788  0.044640 -0.006680 -0.048150   
V2183          -0.058165  0.536921 -0.029821  0.526226 -0.088325 -0.113955   
V2187           0.014367 -0.005225 -0.018045 -0.082054  0.050283  0.014671   
V2181           0.017542 -0.095575  0.050725 -0.050498  0.534619  0.036806   
V2152           0.040213  0.128085 -0.046285  0.142611 -0.055261 -0.035358   
V2153          -0.024463  0.018920 -0.017911 -0.020044  0.004421 -0.027370   
V2156          -0.027227  0.118125 -0.031946  0.093757 -0.028257 -0.085951   

                   V2128     V2201     V2173     V2194  ...     V2137  \
V2178           0.089366  0.058376 -0.093280  0.104285  ...  0.079653   
V2188          -0.008868  0.004788  0.228164  0.012967  ... -0.007028   
V2197           0.071150  0.216064 -0.017428  0.136308  ...  0.056565   
V2184          -0.007436  0.014461  0.280610  0.021631  ... -0.001482   
V2186           0.017063  0.019671 -0.028203  0.001037  ...  0.012795   
V2171           0.014454 -0.000423 -0.078977 -0.024662  ...  0.025179   
V2128           1.000000  0.054623  0.000138  0.069551  ...  0.307463   
V2201           0.054623  1.000000  0.027688  0.096337  ...  0.031748   
V2173           0.000138  0.027688  1.000000  0.042823  ... -0.011170   
V2194           0.069551  0.096337  0.042823  1.000000  ...  0.048543   
V2166          -0.035053 -0.062426 -0.163089 -0.091508  ... -0.007848   
wave           -0.065838  0.013591 -0.011573 -0.035767  ... -0.067752   
V2176           0.097158  0.074899 -0.085093  0.098826  ...  0.080364   
V2175           0.042698  0.027308 -0.088557  0.044307  ...  0.046814   
V2177           0.039213  0.036957 -0.039010  0.042682  ...  0.026470   
V2116           0.211496  0.113149 -0.053363  0.174373  ...  0.182534   
V2125           0.279562  0.040517 -0.021766  0.061210  ...  0.250703   
V2182           0.001720 -0.037450 -0.205418 -0.031576  ...  0.009013   
sex            -0.022753  0.018452 -0.018493 -0.062476  ...  0.000668   
race           -0.012443 -0.050088 -0.097910 -0.023764  ...  0.005081   
V2460           0.167413  0.035037 -0.020208  0.028924  ...  0.144419   
RESPONDENT_AGE  0.009429  0.009460 -0.014265  0.018860  ...  0.000563   
V2185           0.015976  0.002617 -0.108135 -0.008964  ...  0.016480   
V2193           0.037751  0.028148  0.018646  0.177822  ...  0.039115   
V2163           0.006270  0.026093  0.140415  0.029167  ... -0.013144   
V49             0.004094 -0.015206 -0.120524 -0.044455  ... -0.003172   
V2108           0.194583  0.092103  0.011494  0.194339  ...  0.130007   
V2101           0.212239  0.078118 -0.076818  0.107508  ...  0.176522   
V2180           0.007736 -0.019815 -0.176829 -0.022944  ...  0.015510   
V2164           0.009983  0.045771  0.159947  0.049470  ... -0.012324   
V2191           0.049851  0.113170 -0.048448  0.054998  ...  0.031524   
V2195           0.043004  0.096540  0.029210  0.287661  ...  0.036775   
V2155          -0.010384  0.017098  0.119758  0.040539  ... -0.028719   
V2196           0.059567  0.223303  0.127376  0.256581  ...  0.024912   
V2189           0.003447  0.024631  0.260970  0.013663  ...  0.009314   
V2179          -0.049856  0.017037  0.538661  0.004064  ... -0.044086   
V13             0.012240 -0.030317 -0.085578 -0.028648  ...  0.023291   
V2143           0.301238  0.037267 -0.014579  0.058836  ...  0.338213   
V2134           0.225288  0.037230 -0.008580  0.024640  ...  0.318344   
V2172          -0.003964 -0.045249 -0.302182 -0.047912  ... -0.000513   
V2137           0.307463  0.031748 -0.011170  0.048543  ...  1.000000   
V2140           0.087342  0.005181 -0.003708  0.011889  ...  0.128621   
V2105           0.180318  0.132834  0.043948  0.225754  ...  0.138353   
V2157          -0.026373 -0.008458  0.033399 -0.016665  ... -0.032433   
V2183          -0.010688  0.026671  0.365098  0.045440  ... -0.012606   
V2187          -0.002975 -0.028137 -0.169927 -0.023250  ...  0.007803   
V2181           0.026554  0.019818 -0.064495  0.005395  ...  0.015473   
V2152           0.010043 -0.008924  0.109886  0.029292  ...  0.002346   
V2153          -0.007439 -0.005400 -0.008876 -0.021156  ... -0.014636   
V2156          -0.027584 -0.000554  0.088407  0.015746  ... -0.032821   

                   V2140     V2105     V2157     V2183     V2187     V2181  \
V2178           0.026645  0.172417 -0.013618 -0.058165  0.014367  0.017542   
V2188          -0.024031  0.028762  0.077304  0.536921 -0.005225 -0.095575   
V2197           0.031365  0.175986 -0.037788 -0.029821 -0.018045  0.050725   
V2184           0.005124  0.042125  0.044640  0.526226 -0.082054 -0.050498   
V2186           0.000927  0.032539 -0.006680 -0.088325  0.050283  0.534619   
V2171           0.038686 -0.015343 -0.048150 -0.113955  0.014671  0.036806   
V2128           0.087342  0.180318 -0.026373 -0.010688 -0.002975  0.026554   
V2201           0.005181  0.132834 -0.008458  0.026671 -0.028137  0.019818   
V2173          -0.003708  0.043948  0.033399  0.365098 -0.169927 -0.064495   
V2194           0.011889  0.225754 -0.016665  0.045440 -0.023250  0.005395   
V2166          -0.003089 -0.159747 -0.014198 -0.109900  0.086412 -0.047589   
wave            0.000054 -0.068175 -0.002512 -0.094130  0.024373 -0.052886   
V2176           0.029716  0.222134 -0.005094 -0.055107  0.014491  0.010430   
V2175           0.017742  0.079870 -0.026631 -0.041746  0.053744  0.016969   
V2177           0.014397  0.079985  0.007484 -0.004764  0.025826  0.013020   
V2116           0.029111  0.532954 -0.045172 -0.034615 -0.017631  0.028145   
V2125           0.191464  0.142629 -0.031526 -0.029624 -0.003327  0.029798   
V2182          -0.001022 -0.097161  0.009732 -0.187248  0.594918  0.165268   
sex            -0.013659  0.018516  0.035184  0.199213  0.000268 -0.224100   
race           -0.005319  0.002361  0.098590 -0.061729  0.096779 -0.003990   
V2460           0.214431  0.070677 -0.027215 -0.031959 -0.007748  0.039341   
RESPONDENT_AGE  0.001903  0.024208 -0.041384 -0.049884  0.003464  0.030982   
V2185           0.001801 -0.012093 -0.019729 -0.288984  0.185083  0.127662   
V2193           0.020512  0.098869 -0.039690  0.058654 -0.000871  0.005188   
V2163           0.002053  0.026985 -0.003156  0.128013 -0.098676 -0.023328   
V49            -0.005730 -0.033637  0.178629 -0.110485  0.081026  0.074250   
V2108           0.039752  0.542470 -0.014267  0.025048 -0.054777  0.014596   
V2101           0.046630  0.390300 -0.071695 -0.109326  0.003128  0.075156   
V2180           0.012439 -0.059889 -0.010921 -0.329192  0.191071  0.283127   
V2164           0.000331  0.054901 -0.005803  0.136287 -0.115267 -0.018556   
V2191           0.005816  0.132798  0.000377 -0.081921  0.047335  0.088670   
V2195           0.011747  0.175860 -0.023720  0.012994  0.006767  0.026602   
V2155          -0.024549  0.050861  0.230122  0.106558 -0.059579 -0.028031   
V2196           0.008922  0.242067 -0.009056  0.083057 -0.081563  0.047621   
V2189          -0.016317  0.070536  0.056400  0.394245 -0.055655 -0.081609   
V2179           0.001356 -0.000841  0.036419  0.365207 -0.165824 -0.122872   
V13             0.001935 -0.069998 -0.010225 -0.082091  0.124938  0.038828   
V2143           0.151030  0.134530 -0.029176 -0.019911  0.003988  0.025124   
V2134           0.126647  0.094124 -0.014274 -0.016830  0.009191  0.018325   
V2172           0.011229 -0.102753 -0.035586 -0.356496  0.130779  0.090098   
V2137           0.128621  0.138353 -0.032433 -0.012606  0.007803  0.015473   
V2140           1.000000  0.025843 -0.027812 -0.013527 -0.006684  0.026274   
V2105           0.025843  1.000000 -0.017055  0.067867 -0.082546  0.014312   
V2157          -0.027812 -0.017055  1.000000  0.056553  0.011611 -0.017673   
V2183          -0.013527  0.067867  0.056553  1.000000 -0.205304 -0.162375   
V2187          -0.006684 -0.082546  0.011611 -0.205304  1.000000  0.052120   
V2181           0.026274  0.014312 -0.017673 -0.162375  0.052120  1.000000   
V2152          -0.011394  0.043009  0.044792  0.174220 -0.072245 -0.080834   
V2153          -0.029016 -0.034165  0.003419 -0.007107  0.016330  0.012776   
V2156          -0.042198  0.001628  0.269085  0.142227 -0.025125 -0.050770   

                   V2152     V2153     V2156  
V2178           0.040213 -0.024463 -0.027227  
V2188           0.128085  0.018920  0.118125  
V2197          -0.046285 -0.017911 -0.031946  
V2184           0.142611 -0.020044  0.093757  
V2186          -0.055261  0.004421 -0.028257  
V2171          -0.035358 -0.027370 -0.085951  
V2128           0.010043 -0.007439 -0.027584  
V2201          -0.008924 -0.005400 -0.000554  
V2173           0.109886 -0.008876  0.088407  
V2194           0.029292 -0.021156  0.015746  
V2166           0.003758  0.011218 -0.061093  
wave           -0.030708 -0.080296 -0.005674  
V2176          -0.005422 -0.012170 -0.027676  
V2175          -0.010290 -0.006780 -0.032018  
V2177          -0.011738  0.000419 -0.013191  
V2116           0.049691 -0.017990 -0.055293  
V2125          -0.005147 -0.019401 -0.045825  
V2182          -0.074204  0.026510 -0.040563  
sex             0.009143  0.002239  0.028336  
race            0.056905 -0.004022  0.051642  
V2460          -0.003214 -0.023490 -0.035494  
RESPONDENT_AGE -0.044644  0.003840 -0.049813  
V2185          -0.096766  0.005720 -0.042205  
V2193           0.059684 -0.020532 -0.006511  
V2163           0.058871  0.001608  0.047499  
V49            -0.057192  0.010428 -0.083682  
V2108           0.023486 -0.028804 -0.004466  
V2101          -0.060251 -0.023931 -0.095518  
V2180          -0.111524  0.016348 -0.053369  
V2164           0.041675 -0.022228  0.051417  
V2191          -0.097052 -0.003669 -0.052103  
V2195          -0.031077 -0.063056 -0.021474  
V2155           0.032753 -0.021865  0.205573  
V2196          -0.111577 -0.032775  0.036915  
V2189           0.118544 -0.005428  0.092473  
V2179           0.008602 -0.014649  0.087404  
V13             0.025354  0.007890 -0.028204  
V2143          -0.002386 -0.005256 -0.027044  
V2134           0.000950 -0.011670 -0.032677  
V2172          -0.149836  0.015815 -0.106495  
V2137           0.002346 -0.014636 -0.032821  
V2140          -0.011394 -0.029016 -0.042198  
V2105           0.043009 -0.034165  0.001628  
V2157           0.044792  0.003419  0.269085  
V2183           0.174220 -0.007107  0.142227  
V2187          -0.072245  0.016330 -0.025125  
V2181          -0.080834  0.012776 -0.050770  
V2152           1.000000 -0.012334  0.094300  
V2153          -0.012334  1.000000  0.018627  
V2156           0.094300  0.018627  1.000000  

[50 rows x 50 columns]
No description has been provided for this image
Highly correlated variable pairs (|corr| > 0.5):
  Variable1 Variable2  Correlation
0     V2188     V2183     0.536921
1     V2184     V2189     0.583240
2     V2184     V2183     0.526226
3     V2186     V2181     0.534619
4     V2173     V2179     0.538661
5     V2116     V2105     0.532954
6     V2182     V2187     0.594918
7     V2185     V2180     0.507020
8     V2108     V2105     0.542470
In [ ]:
# --- Identify & Convert Categorical Columns ---
import logging

# Identify all categorical (object or categorical dtype) columns
categorical_predictor_cols = new_data.select_dtypes(include=['object', 'category']).columns.tolist()

# If you also want to include numerical columns as categorical (optional)
# categorical_predictor_cols = new_data.columns.tolist()  

# Convert identified columns to categorical
convert_to_categorical(new_data, categorical_predictor_cols)

# Logging information
logging.info("Verifying data types after conversion:")
logging.info(new_data[categorical_predictor_cols].dtypes)

# --- Train/Test Split ---
X = new_data.drop('nicotine12d', axis=1)
y = new_data['nicotine12d']
X_train, X_test, y_train, y_test = create_train_test_split(X, y)

logging.info("Train Set Balance:")
logging.info(y_train.value_counts(normalize=True))
logging.info("Test Set Balance:")
logging.info(y_test.value_counts(normalize=True))

# --- Missing Value Indicators ---
X_train_with_indicators, X_test_with_indicators = create_missing_indicators(X_train, X_test)

# Treat everything as categorical in this example
categorical_features = X_train_with_indicators.columns.tolist()

# Create & Fit Preprocessor
preprocessor = create_preprocessor(categorical_features)
preprocessor.fit(X_train_with_indicators)
logging.info("Preprocessor fitted successfully.")
2025-02-15 11:02:51,511 - INFO - Categorical conversion complete.
2025-02-15 11:02:51,512 - INFO - Verifying data types after conversion:
2025-02-15 11:02:51,513 - INFO - Series([], dtype: object)
2025-02-15 11:02:51,512 - INFO - Verifying data types after conversion:
2025-02-15 11:02:51,513 - INFO - Series([], dtype: object)
2025-02-15 11:02:51,538 - INFO - Training set shape: (26184, 50)
2025-02-15 11:02:51,539 - INFO - Testing set shape: (6546, 50)
2025-02-15 11:02:51,539 - INFO - Train Set Balance:
2025-02-15 11:02:51,541 - INFO - nicotine12d
1.0    0.552589
0.0    0.447411
Name: proportion, dtype: float64
2025-02-15 11:02:51,542 - INFO - Test Set Balance:
2025-02-15 11:02:51,543 - INFO - nicotine12d
1.0    0.552551
0.0    0.447449
Name: proportion, dtype: float64
2025-02-15 11:02:51,567 - INFO - Missing indicators created.
2025-02-15 11:02:52,047 - INFO - Preprocessor fitted successfully.

Model Training¶

Lasso¶

In [ ]:
# Define the preprocessing for numeric columns (scale them)
numeric_features = X_train_with_indicators.select_dtypes(include=['int64', 'float64']).columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Define the preprocessing for categorical features (encode them)
categorical_features = X_train_with_indicators.select_dtypes(include=['object']).columns
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Create the pipeline
lasso_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                 ('classifier', LogisticRegression(penalty='l1', solver='saga'))])
# Define an expanded tuning grid.
# - 'classifier__C': A wide range of regularization strengths.
# - 'classifier__tol': Different tolerance levels for stopping criteria.
# - 'classifier__max_iter': More iterations to ensure convergence.
# - 'preprocessor__cat__drop': Option to drop the first level or keep all levels.
param_grid = {
    'classifier__C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
    'classifier__tol': [1e-4, 1e-3, 1e-2],
    'classifier__max_iter': [1000, 2000, 5000],
    # Tune whether to drop the first level for categorical features or not.
    'preprocessor__cat__onehot__drop': [None, 'first'],
    # Experiment with class weights (None or 'balanced') to help if classes are imbalanced.
    'classifier__class_weight': [None, 'balanced']
}

# Define a cross-validation strategy.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# Initialize GridSearchCV with your pipeline (lasso_pipeline)
grid_search = GridSearchCV(
    estimator=lasso_pipeline,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1,
    verbose=1
)

# Fit the grid search on the training data.
grid_search.fit(X_train_with_indicators, y_train)

# Display the best parameters and the best ROC AUC achieved during cross-validation.
print("Best Parameters:", grid_search.best_params_)
print("Best ROC AUC:", grid_search.best_score_)

# Use the best estimator to evaluate performance on the test data.
best_lasso_model = grid_search.best_estimator_
train_evaluate_model(
    model=best_lasso_model,
    X_train=X_train_with_indicators,
    y_train=y_train,
    X_test=X_test_with_indicators,
    y_test=y_test,
    model_name="Tuned LASSO Logistic Regression"
)
Fitting 5 folds for each of 252 candidates, totalling 1260 fits
Best Parameters: {'classifier__C': 100, 'classifier__class_weight': 'balanced', 'classifier__max_iter': 2000, 'classifier__tol': 0.01, 'preprocessor__cat__onehot__drop': 'first'}
Best ROC AUC: 0.745313422854059
2025-02-14 13:46:51,624 - INFO - === Tuned LASSO Logistic Regression Evaluation ===
2025-02-14 13:46:51,632 - INFO - Confusion Matrix:
[[2030  899]
 [ 941 2676]]
2025-02-14 13:46:51,646 - INFO - 
Classification Report:
              precision    recall  f1-score   support

         0.0       0.68      0.69      0.69      2929
         1.0       0.75      0.74      0.74      3617

    accuracy                           0.72      6546
   macro avg       0.72      0.72      0.72      6546
weighted avg       0.72      0.72      0.72      6546

2025-02-14 13:46:51,651 - INFO - ROC AUC: 0.7425
No description has been provided for this image
Out[ ]:
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['V2178', 'V2188', 'V2197', 'V2184', 'V2186', 'V2171', 'V2128', 'V2201',
       'V2173', 'V2194', 'V2166', 'wave', 'V2176', 'V2175', 'V2177', 'V2116',
       'V2125', 'V2182', 'sex', 'race', 'V2460', 'RESPONDENT_AGE',...
       'V2172', 'V2137', 'V2140', 'V2105', 'V2157', 'V2183', 'V2187', 'V2181',
       'V2152', 'V2153', 'V2156'],
      dtype='object')),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value='missing',
                                                                                 strategy='constant')),
                                                                  ('onehot',
                                                                   OneHotEncoder(drop='first',
                                                                                 handle_unknown='ignore'))]),
                                                  Index([], dtype='object'))])),
                ('classifier',
                 LogisticRegression(C=100, class_weight='balanced',
                                    max_iter=2000, penalty='l1', solver='saga',
                                    tol=0.01))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['V2178', 'V2188', 'V2197', 'V2184', 'V2186', 'V2171', 'V2128', 'V2201',
       'V2173', 'V2194', 'V2166', 'wave', 'V2176', 'V2175', 'V2177', 'V2116',
       'V2125', 'V2182', 'sex', 'race', 'V2460', 'RESPONDENT_AGE',...
       'V2172', 'V2137', 'V2140', 'V2105', 'V2157', 'V2183', 'V2187', 'V2181',
       'V2152', 'V2153', 'V2156'],
      dtype='object')),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value='missing',
                                                                                 strategy='constant')),
                                                                  ('onehot',
                                                                   OneHotEncoder(drop='first',
                                                                                 handle_unknown='ignore'))]),
                                                  Index([], dtype='object'))])),
                ('classifier',
                 LogisticRegression(C=100, class_weight='balanced',
                                    max_iter=2000, penalty='l1', solver='saga',
                                    tol=0.01))])
ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('scaler', StandardScaler())]),
                                 Index(['V2178', 'V2188', 'V2197', 'V2184', 'V2186', 'V2171', 'V2128', 'V2201',
       'V2173', 'V2194', 'V2166', 'wave', 'V2176', 'V2175', 'V2177', 'V2116',
       'V2125', 'V2182', 'sex', 'race', 'V2460', 'RESPONDENT_AGE', 'V2185',
       'V2193', 'V2163', 'V49', 'V...
       'V2195', 'V2155', 'V2196', 'V2189', 'V2179', 'V13', 'V2143', 'V2134',
       'V2172', 'V2137', 'V2140', 'V2105', 'V2157', 'V2183', 'V2187', 'V2181',
       'V2152', 'V2153', 'V2156'],
      dtype='object')),
                                ('cat',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(fill_value='missing',
                                                                strategy='constant')),
                                                 ('onehot',
                                                  OneHotEncoder(drop='first',
                                                                handle_unknown='ignore'))]),
                                 Index([], dtype='object'))])
Index(['V2178', 'V2188', 'V2197', 'V2184', 'V2186', 'V2171', 'V2128', 'V2201',
       'V2173', 'V2194', 'V2166', 'wave', 'V2176', 'V2175', 'V2177', 'V2116',
       'V2125', 'V2182', 'sex', 'race', 'V2460', 'RESPONDENT_AGE', 'V2185',
       'V2193', 'V2163', 'V49', 'V2108', 'V2101', 'V2180', 'V2164', 'V2191',
       'V2195', 'V2155', 'V2196', 'V2189', 'V2179', 'V13', 'V2143', 'V2134',
       'V2172', 'V2137', 'V2140', 'V2105', 'V2157', 'V2183', 'V2187', 'V2181',
       'V2152', 'V2153', 'V2156'],
      dtype='object')
SimpleImputer(strategy='median')
StandardScaler()
Index([], dtype='object')
SimpleImputer(fill_value='missing', strategy='constant')
OneHotEncoder(drop='first', handle_unknown='ignore')
LogisticRegression(C=100, class_weight='balanced', max_iter=2000, penalty='l1',
                   solver='saga', tol=0.01)
In [ ]:
import numpy as np
import pandas as pd

# Assume 'best_lasso_model' is your already fitted pipeline from GridSearchCV.
# Extract the logistic regression model from the pipeline.
lr = best_lasso_model.named_steps['classifier']

# For binary classification, lr.coef_ has shape (1, n_features)
coefficients = lr.coef_[0]

# Get the preprocessor (the ColumnTransformer) from the pipeline.
preprocessor = best_lasso_model.named_steps['preprocessor']

# -------------------------------
# 1. Numeric Features and Importances
# -------------------------------
# The numeric transformer was applied first.
numeric_features = preprocessor.transformers_[0][2]  # list (or Index) of numeric feature names
n_numeric = len(numeric_features)
numeric_coefs = coefficients[:n_numeric]
numeric_importances = pd.Series(np.abs(numeric_coefs), index=numeric_features)

# -------------------------------
# 2. Categorical Features (Aggregation)
# -------------------------------
# Get the original categorical columns from the transformer.
cat_features = preprocessor.transformers_[1][2]

# Check if there are any categorical features
if len(cat_features) > 0:
    # Retrieve the OneHotEncoder from the categorical pipeline.
    onehot_encoder = preprocessor.named_transformers_['cat'].named_steps['onehot']
    
    # The remaining coefficients correspond to the one-hot encoded features.
    categorical_coefs = coefficients[n_numeric:]
    
    aggregated_cat_importance = {}
    start_idx = 0
    # Loop over each original categorical feature and its categories.
    for feature, categories in zip(cat_features, onehot_encoder.categories_):
        n_categories = len(categories)
        # Get the coefficients for the dummy columns of this feature.
        feature_coefs = categorical_coefs[start_idx:start_idx + n_categories]
        # Aggregate by summing the absolute values.
        aggregated_cat_importance[feature] = np.sum(np.abs(feature_coefs))
        start_idx += n_categories

    aggregated_cat_importance = pd.Series(aggregated_cat_importance)
else:
    # If there are no categorical features, create an empty Series.
    aggregated_cat_importance = pd.Series(dtype=float)

# -------------------------------
# 3. Combine and Select Top 20
# -------------------------------
combined_importances = pd.concat([numeric_importances, aggregated_cat_importance])
top20_features = combined_importances.sort_values(ascending=False).head(20)

print("Top 20 Aggregated Feature Importances (by absolute coefficient value):")
print(top20_features)

# Plot the top 20 features
plt.figure(figsize=(10, 6))
sns.barplot(x=top20_features.values, y=top20_features.index, palette="viridis")
plt.title('Top 20 Aggregated Feature Importances (by absolute coefficient value)')
plt.xlabel('Importance')
plt.ylabel('Features')
plt.show()
Top 20 Aggregated Feature Importances (by absolute coefficient value):
wave     0.917893
V2116    0.204811
V2137    0.107522
sex      0.096266
V2105    0.092966
V2166    0.086576
V2134    0.079518
V2128    0.074483
V13      0.067082
V2101    0.058904
V2143    0.052490
V2176    0.051462
V2187    0.046222
V2179    0.045720
V2193    0.045389
V2153    0.045158
V2188    0.039777
V2157    0.038560
V2182    0.038260
V2194    0.037967
dtype: float64
/tmp/ipykernel_1545623/809380159.py:64: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=top20_features.values, y=top20_features.index, palette="viridis")
No description has been provided for this image
In [ ]:
#########################################
# 2. Permutation Importance (Aggregated by Original Feature)
#########################################

from sklearn.inspection import permutation_importance

# Compute permutation importance using the original features (X_test_with_indicators).
perm_results = permutation_importance(
    best_lasso_model,
    X_test_with_indicators,
    y_test,
    scoring='roc_auc',
    n_repeats=10,
    random_state=RANDOM_STATE
)

perm_imp_df = pd.DataFrame({
    'Feature': X_test_with_indicators.columns,
    'Importance': perm_results.importances_mean
}).sort_values(by='Importance', ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=perm_imp_df.head(20), palette='magma')
plt.title("Permutation Importance (Aggregated Original Features)")
plt.xlabel("Mean Importance")
plt.ylabel("Feature")
plt.tight_layout()
plt.show()

# Display the top 20 features by permutation importance
print(perm_imp_df.head(20))
/tmp/ipykernel_1489303/241353914.py:23: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='Importance', y='Feature', data=perm_imp_df.head(20), palette='magma')
No description has been provided for this image
   Feature  Importance
11    wave    0.218731
15   V2116    0.016337
42   V2105    0.004965
40   V2137    0.003549
38   V2134    0.002059
10   V2166    0.001284
18     sex    0.001066
6    V2128    0.000694
35   V2179    0.000675
1    V2188    0.000622
45   V2187    0.000611
23   V2193    0.000493
26   V2108    0.000476
8    V2173    0.000444
43   V2157    0.000364
13   V2175    0.000297
36     V13    0.000294
7    V2201    0.000282
30   V2191    0.000265
31   V2195    0.000249
In [ ]:
import shap
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Get feature names (only using available features)
feature_names = numeric_features

# Calculate SHAP values for numeric features
explainer = shap.LinearExplainer(
    best_lasso_model.named_steps['classifier'],
    best_lasso_model.named_steps['preprocessor'].transform(X_train_with_indicators)
)
shap_values = explainer.shap_values(
    best_lasso_model.named_steps['preprocessor'].transform(X_train_with_indicators)
)

# Calculate feature importance (using absolute mean SHAP values)
feature_importance = {}
for idx, feature in enumerate(feature_names):
    if idx < shap_values.shape[1]:  # Only process features within bounds
        feature_importance[feature] = np.abs(shap_values[:, idx]).mean()

# Convert to DataFrame and sort
importance_df = pd.DataFrame({
    'Feature': list(feature_importance.keys()),
    'Importance': list(feature_importance.values())
}).sort_values('Importance', ascending=False)

# Display top 20 features
print("\nTop 20 Important Features:")
print(importance_df.head(20))

# Create visualization
plt.figure(figsize=(12, 8))
sns.barplot(data=importance_df.head(20), x='Importance', y='Feature')
plt.title('Top 20 Feature Importance Based on SHAP Values')
plt.xlabel('Mean Absolute SHAP Value')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()
Top 20 Important Features:
   Feature  Importance
11    wave    0.831211
15   V2116    0.154883
18     sex    0.096673
42   V2105    0.079273
10   V2166    0.075389
36     V13    0.059174
27   V2101    0.049204
45   V2187    0.042473
43   V2157    0.037265
1    V2188    0.036909
35   V2179    0.035972
12   V2176    0.033787
23   V2193    0.033327
17   V2182    0.032703
9    V2194    0.029956
30   V2191    0.027242
48   V2153    0.026061
8    V2173    0.024039
13   V2175    0.023854
40   V2137    0.021965
No description has been provided for this image
In [ ]:
from sklearn.inspection import PartialDependenceDisplay
import matplotlib.pyplot as plt

top_10_features = importance_df.head(10)['Feature'].tolist()

print("Top 10 Features:", top_10_features)
for feature in top_10_features:
    print(f"{feature}: {X_train_with_indicators[feature].dtype}")

print("Unique Values in Top 10 Features:")
for feature in top_10_features:
    unique_values = X_train_with_indicators[feature].unique()
    print(f"{feature}: {unique_values}")

fig, axes = plt.subplots(2, 5, figsize=(20, 10))
axes = axes.flatten()
for i, feature in enumerate(top_10_features):
    if len(X_train_with_indicators[feature].unique()) > 1:
        try:
            PartialDependenceDisplay.from_estimator(
                best_lasso_model,
                X_train_with_indicators,
                features=[feature],
                ax=axes[i]
            )
            axes[i].set_title(f'PDP for {feature}')
        except ValueError as e:
            print(f"Error plotting PDP for {feature}: {e}")
    else:
        print(f"Skipping PDP for {feature}: Only one unique value in the dataset.")
        axes[i].set_visible(False)

plt.tight_layout()
plt.show()
Top 10 Features: ['wave', 'V2116', 'sex', 'V2105', 'V2166', 'V13', 'V2101', 'V2187', 'V2157', 'V2188']
wave: int64
V2116: float64
sex: float64
V2105: float64
V2166: float64
V13: int64
V2101: float64
V2187: float64
V2157: float64
V2188: float64
Unique Values in Top 10 Features:
wave: [2019 2018 2017 2022 2023 2021 2020]
V2116: [nan  2.  1.  3.  7.  4.  6.  5.]
sex: [0. 1.]
V2105: [ 1.  6.  2.  3.  4.  5.  7. nan]
V2166: [nan  2.  4.  6.  1.  5.  3.  8.  7.]
V13: [2 4 3 1]
V2101: [ 1.  2.  3.  4. nan  5.]
V2187: [ 0.  1. nan]
V2157: [ 1.  0. nan]
V2188: [ 1.  0. nan]
Error plotting PDP for sex: cannot reshape array of size 1 into shape (2)
No description has been provided for this image
In [ ]:
#############################
# Degree 2 Interaction 
###############################

import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Set a random state for reproducibility.
RANDOM_STATE = 42

# ----------------------------
# 1. Build the Pipeline
# ----------------------------
# This pipeline consists of:
#  - preprocessor: your existing preprocessor for data cleaning/encoding.
#  - poly: PolynomialFeatures with degree 2 (pairwise interactions only, no bias).
#  - classifier: LogisticRegression with L1 penalty (sparse model) using the liblinear solver.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)),
    ('classifier', LogisticRegression(
        penalty='l1',
        solver='liblinear',
        random_state=RANDOM_STATE,
        max_iter=1000
    ))
])

# ----------------------------
# 2. Set Up Hyperparameter Tuning
# ----------------------------
# Here we define a parameter grid for tuning.
# In this example, we tune the inverse regularization strength 'C' for logistic regression.
param_grid = {
    'classifier__C': [0.001, 0.01, 0.1, 1],
    'classifier__penalty': ['l1', 'l2'],
    'classifier__solver': ['liblinear'],  # 'saga' could also be tested if using larger datasets
    'classifier__max_iter': [500, 1000],
    'classifier__tol': [1e-4, 1e-3, 1e-2]
}


# ----------------------------
# 3. Create and Fit GridSearchCV
# ----------------------------
# We use 5-fold cross-validation and accuracy as the scoring metric.
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,    # Use all available CPU cores.
    verbose=1
)

# Fit the grid search on the training data.
grid_search.fit(X_train_with_indicators, y_train)

# ----------------------------
# 4. Evaluate the Best Model
# ----------------------------
print("Best hyperparameters:", grid_search.best_params_)
print("Best cross-validation accuracy: {:.4f}".format(grid_search.best_score_))

# Use the best model to predict on the test set.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_with_indicators)
test_accuracy = accuracy_score(y_test, y_pred)
print("Test set accuracy: {:.4f}".format(test_accuracy))
Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best hyperparameters: {'classifier__C': 0.01, 'classifier__max_iter': 500, 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear', 'classifier__tol': 0.01}
Best cross-validation accuracy: 0.8037
Test set accuracy: 0.8093
In [ ]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    f1_score,
    classification_report,
    roc_curve  # For ROC curve
)
import matplotlib.pyplot as plt  # For plotting

# ----------------------------
# 4. Evaluate the Best Model
# ----------------------------
print("Best hyperparameters:", grid_search.best_params_)
print("Best cross-validation accuracy: {:.4f}".format(grid_search.best_score_))

# Use the best model to predict on the test set.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_with_indicators)
y_pred_proba = best_model.predict_proba(X_test_with_indicators)[:, 1]  # Get probabilities for the positive class

# Calculate additional metrics
test_accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
f1 = f1_score(y_test, y_pred)

print("Test set accuracy: {:.4f}".format(test_accuracy))
print("Test set ROC AUC: {:.4f}".format(roc_auc))
print("Test set F1 score: {:.4f}".format(f1))

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# ----------------------------
# 5. Plot the ROC Curve
# ----------------------------
# Compute FPR, TPR, and thresholds
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)

# Plot the ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f"ROC Curve (AUC = {roc_auc:.4f})", color='blue')
plt.plot([0, 1], [0, 1], 'k--', label="Random Guess")  # Diagonal line for reference
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate (FPR)")
plt.ylabel("True Positive Rate (TPR)")
plt.title("ROC Curve")
plt.legend(loc="lower right")
plt.grid(True)
plt.show()
Best hyperparameters: {'classifier__C': 0.01, 'classifier__max_iter': 500, 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear', 'classifier__tol': 0.01}
Best cross-validation accuracy: 0.8037
Test set accuracy: 0.8093
Test set ROC AUC: 0.8758
Test set F1 score: 0.8286

Classification Report:
              precision    recall  f1-score   support

         0.0       0.79      0.78      0.79      2929
         1.0       0.82      0.83      0.83      3617

    accuracy                           0.81      6546
   macro avg       0.81      0.81      0.81      6546
weighted avg       0.81      0.81      0.81      6546

No description has been provided for this image
In [ ]:
# -----------------------------
# Step 1: Retrieve One-Hot Encoded Feature Names
# -----------------------------
# Access the pipeline for categorical features
num_pipeline = best_model.named_steps['preprocessor'].named_transformers_['num']
# Then access the OneHotEncoder within that pipeline
ohe = num_pipeline.named_steps['scaler']
# Now get the encoded feature names
encoded_feature_names = ohe.get_feature_names_out(numeric_features)


# -----------------------------
# Step 2: Retrieve Interaction Feature Names
# -----------------------------
# Get the feature names after applying PolynomialFeatures (which created interaction terms)
interaction_transformer = best_model.named_steps['poly']  # Corrected step name here
interaction_feature_names = interaction_transformer.get_feature_names_out(encoded_feature_names)

# -----------------------------
# Step 3: Extract Classifier Coefficients
# -----------------------------
# For binary classification, the classifier’s coef_ is an array of shape (1, n_features)
coefficients = best_model.named_steps['classifier'].coef_[0]

# Build a DataFrame mapping each expanded feature (both main effects and interactions) to its coefficient
features_df = pd.DataFrame({
    'interaction_feature': interaction_feature_names,
    'coefficient': coefficients,
    'abs_coef': np.abs(coefficients)
})

# -----------------------------
# Step 4: Filter for Interaction Features Only
# -----------------------------
# With interaction_only=True, main effects do not contain a space, while interaction terms do.
interaction_df = features_df[features_df['interaction_feature'].str.contains(' ')].copy()

# -----------------------------
# Step 5: Aggregate to Original Feature Combinations
# -----------------------------
# Define a function to extract the original feature names from an interaction term.
def extract_original_features(interaction_term):
    # For example, "V13_A V2152_B" or "V13_A V2152_B V49_Low"
    parts = interaction_term.split(' ')
    # Extract the original feature name from each part (everything before the underscore)
    original_features = [part.split('_')[0] for part in parts]
    # Sort the features so order doesn't matter (e.g., ('V13', 'V2152') is the same as ('V2152', 'V13'))
    return tuple(sorted(original_features))

# Create a new column for the aggregated original feature combination
interaction_df['feature_combination'] = interaction_df['interaction_feature'].apply(extract_original_features)

# Group by the original feature combination and sum the absolute coefficient values as a measure of importance
agg_interactions = (
    interaction_df.groupby('feature_combination')['abs_coef']
    .sum()
    .reset_index()
    .rename(columns={'abs_coef': 'aggregated_importance'})
)

# Sort the aggregated interactions by importance in descending order
agg_interactions = agg_interactions.sort_values('aggregated_importance', ascending=False)

# -----------------------------
# Step 6: Display the Top 20 Aggregated Interaction Features
# -----------------------------
top20_agg_interactions = agg_interactions.head(20)
print("Top 20 Aggregated Interaction Features by Summed Absolute Coefficient:")
print(top20_agg_interactions)

# Optionally, plot the results.
plt.figure(figsize=(10, 6))
sns.barplot(
    x='aggregated_importance',
    y=top20_agg_interactions['feature_combination'].astype(str),
    data=top20_agg_interactions,
    palette='viridis'
)
plt.title("Top 20 Aggregated Interaction Features")
plt.xlabel("Aggregated Importance (Sum of |Coefficients|)")
plt.ylabel("Feature Combination")
plt.tight_layout()
plt.show()
Top 20 Aggregated Interaction Features by Summed Absolute Coefficient:
     feature_combination  aggregated_importance
189        (V2105, wave)               0.664864
278        (V2116, wave)               0.587994
143        (V2101, wave)               0.406701
1203       (V2196, wave)               0.244442
1188       (V2194, wave)               0.116426
789        (V2166, wave)               0.114012
1214       (V2201, wave)               0.110926
1169       (V2191, wave)               0.092729
1088       (V2184, wave)               0.070952
1196       (V2195, wave)               0.064390
873        (V2173, wave)               0.062596
96           (V13, wave)               0.058666
1224         (sex, wave)               0.042238
924        (V2176, wave)               0.038159
1146       (V2188, wave)               0.035029
818        (V2171, wave)               0.031464
899        (V2175, wave)               0.029978
1119       (V2186, wave)               0.027526
558        (V2152, wave)               0.027012
1181      (V2194, V2196)               0.026777
/tmp/ipykernel_1545623/3475757056.py:73: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(
No description has been provided for this image
In [ ]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Set a random state for reproducibility.
RANDOM_STATE = 42

# ----------------------------
# 1. Build the Pipeline
# ----------------------------
# This pipeline consists of:
#  - preprocessor: your existing preprocessor for data cleaning/encoding.
#  - poly: PolynomialFeatures with degree 2 (pairwise interactions only, no bias).
#  - classifier: LogisticRegression with L1 penalty (sparse model) using the liblinear solver.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(degree=3, interaction_only=True, include_bias=False)),
    ('classifier', LogisticRegression(
        penalty='l1',
        solver='liblinear',
        random_state=RANDOM_STATE,
        max_iter=500
    ))
])

# ----------------------------
# 2. Set Up Hyperparameter Tuning
# ----------------------------
# Here we define a parameter grid for tuning.
# In this example, we tune the inverse regularization strength 'C' for logistic regression.
param_grid = {
    'classifier__C': [0.01, 0.1, 1],
    'classifier__penalty': ['l1'],
    'classifier__solver': ['liblinear'],
    'classifier__max_iter': [500, 1000],
    'classifier__tol': [1e-4]
}


# ----------------------------
# 3. Create and Fit GridSearchCV
# ----------------------------
# We use 5-fold cross-validation and accuracy as the scoring metric.
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=8,    # need to adjust accordingly to precent memory overflow.
    verbose=1
)

# Fit the grid search on the training data.
grid_search.fit(X_train_with_indicators, y_train)

# ----------------------------
# 4. Evaluate the Best Model
# ----------------------------
print("Best hyperparameters:", grid_search.best_params_)
print("Best cross-validation accuracy: {:.4f}".format(grid_search.best_score_))

# Use the best model to predict on the test set.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_with_indicators)
test_accuracy = accuracy_score(y_test, y_pred)
print("Test set accuracy: {:.4f}".format(test_accuracy))
Fitting 3 folds for each of 6 candidates, totalling 18 fits
Best hyperparameters: {'classifier__C': 0.01, 'classifier__max_iter': 500, 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear', 'classifier__tol': 0.0001}
Best cross-validation accuracy: 0.8030
Test set accuracy: 0.8008
In [ ]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    f1_score,
    classification_report,
    roc_curve  # For ROC curve
)
import matplotlib.pyplot as plt  # For plotting

# ----------------------------
# 4. Evaluate the Best Model
# ----------------------------
print("Best hyperparameters:", grid_search.best_params_)
print("Best cross-validation accuracy: {:.4f}".format(grid_search.best_score_))

# Use the best model to predict on the test set.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_with_indicators)
y_pred_proba = best_model.predict_proba(X_test_with_indicators)[:, 1]  # Get probabilities for the positive class

# Calculate additional metrics
test_accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
f1 = f1_score(y_test, y_pred)

print("Test set accuracy: {:.4f}".format(test_accuracy))
print("Test set ROC AUC: {:.4f}".format(roc_auc))
print("Test set F1 score: {:.4f}".format(f1))

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# ----------------------------
# 5. Plot the ROC Curve
# ----------------------------
# Compute FPR, TPR, and thresholds
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)

# Plot the ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f"ROC Curve (AUC = {roc_auc:.4f})", color='blue')
plt.plot([0, 1], [0, 1], 'k--', label="Random Guess")  # Diagonal line for reference
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate (FPR)")
plt.ylabel("True Positive Rate (TPR)")
plt.title("ROC Curve")
plt.legend(loc="lower right")
plt.grid(True)
plt.show()
Best hyperparameters: {'classifier__C': 0.01, 'classifier__max_iter': 500, 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear', 'classifier__tol': 0.0001}
Best cross-validation accuracy: 0.8030
Test set accuracy: 0.8008
Test set ROC AUC: 0.8729
Test set F1 score: 0.8210

Classification Report:
              precision    recall  f1-score   support

         0.0       0.78      0.77      0.78      2929
         1.0       0.82      0.83      0.82      3617

    accuracy                           0.80      6546
   macro avg       0.80      0.80      0.80      6546
weighted avg       0.80      0.80      0.80      6546

No description has been provided for this image
In [ ]:
# -----------------------------
# Step 1: Retrieve One-Hot Encoded Feature Names
# -----------------------------
# Access the pipeline for categorical features
num_pipeline = best_model.named_steps['preprocessor'].named_transformers_['num']
# Then access the OneHotEncoder within that pipeline
ohe = num_pipeline.named_steps['scaler']
# Now get the encoded feature names
encoded_feature_names = ohe.get_feature_names_out(numeric_features)

# -----------------------------
# Step 2: Retrieve Interaction Feature Names
# -----------------------------
# Get the feature names after applying PolynomialFeatures (which created interaction terms)
interaction_transformer = best_model.named_steps['poly']  # Corrected step name here
interaction_feature_names = interaction_transformer.get_feature_names_out(encoded_feature_names)

# -----------------------------
# Step 3: Extract Classifier Coefficients
# -----------------------------
# For binary classification, the classifier’s coef_ is an array of shape (1, n_features)
coefficients = best_model.named_steps['classifier'].coef_[0]

# Build a DataFrame mapping each expanded feature (both main effects and interactions) to its coefficient
features_df = pd.DataFrame({
    'interaction_feature': interaction_feature_names,
    'coefficient': coefficients,
    'abs_coef': np.abs(coefficients)
})

# -----------------------------
# Step 4: Filter for Interaction Features Only
# -----------------------------
# With interaction_only=True, main effects do not contain a space, while interaction terms do.
interaction_df = features_df[features_df['interaction_feature'].str.contains(' ')].copy()

# -----------------------------
# Step 5: Aggregate to Original Feature Combinations
# -----------------------------
# Define a function to extract the original feature names from an interaction term.
def extract_original_features(interaction_term):
    # For example, "V13_A V2152_B" or "V13_A V2152_B V49_Low"
    parts = interaction_term.split(' ')
    # Extract the original feature name from each part (everything before the underscore)
    original_features = [part.split('_')[0] for part in parts]
    # Sort the features so order doesn't matter (e.g., ('V13', 'V2152') is the same as ('V2152', 'V13'))
    return tuple(sorted(original_features))

# Create a new column for the aggregated original feature combination
interaction_df['feature_combination'] = interaction_df['interaction_feature'].apply(extract_original_features)

# Group by the original feature combination and sum the absolute coefficient values as a measure of importance
agg_interactions = (
    interaction_df.groupby('feature_combination')['abs_coef']
    .sum()
    .reset_index()
    .rename(columns={'abs_coef': 'aggregated_importance'})
)

# Sort the aggregated interactions by importance in descending order
agg_interactions = agg_interactions.sort_values('aggregated_importance', ascending=False)

# -----------------------------
# Step 6: Display the Top 20 Aggregated Interaction Features
# -----------------------------
top20_agg_interactions = agg_interactions.head(20)
print("Top 20 Aggregated Interaction Features by Summed Absolute Coefficient:")
print(top20_agg_interactions)

# Optionally, plot the results.
plt.figure(figsize=(10, 6))
sns.barplot(
    x='aggregated_importance',
    y=top20_agg_interactions['feature_combination'].astype(str),
    data=top20_agg_interactions,
    palette='viridis'
)
plt.title("Top 20 Aggregated Interaction Features")
plt.xlabel("Aggregated Importance (Sum of |Coefficients|)")
plt.ylabel("Feature Combination")
plt.tight_layout()
plt.show()
Top 20 Aggregated Interaction Features by Summed Absolute Coefficient:
        feature_combination  aggregated_importance
6634          (V2116, wave)               0.732379
4609          (V2105, wave)               0.708883
3528          (V2101, wave)               0.525570
20768         (V2196, wave)               0.228015
3619   (V2105, V2116, wave)               0.192851
5644          (V2108, wave)               0.167442
20704         (V2194, wave)               0.127520
20804         (V2201, wave)               0.115384
2447   (V2101, V2105, wave)               0.102783
16329         (V2166, wave)               0.098145
20604         (V2191, wave)               0.092698
20740         (V2195, wave)               0.084765
3574   (V2105, V2108, wave)               0.083104
2397      (V13, race, wave)               0.080824
18224         (V2176, wave)               0.079418
17548         (V2173, wave)               0.063086
2538   (V2101, V2116, wave)               0.050391
14872  (V2163, V2164, wave)               0.047727
20008         (V2184, wave)               0.047025
20549  (V2191, V2193, wave)               0.042181
/tmp/ipykernel_1545623/3089346036.py:72: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(
No description has been provided for this image

Random Forest Classifier¶

In [ ]:
# Define Random State for reproducibility
RANDOM_STATE = 42

# Use RepeatedStratifiedKFold for more robust validation
N_SPLITS_CV = 5
N_REPEATS = 1  # repeat the CV multiple times if desired
SCORING_METRIC = 'roc_auc'
VERBOSE = 1

logging.info("\n--- Random Forest (Revised) ---")

# Build pipeline
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor), 
    ('classifier', RandomForestClassifier(random_state=RANDOM_STATE))
])

# Parameter for RandomizedSearch
rf_param_dist = {
    'classifier__n_estimators': [100, 200, 500, 1000],
    'classifier__max_depth': [5, 10, 20, 50],
    'classifier__min_samples_split': [2, 5, 10, 20],
    'classifier__min_samples_leaf': [1, 2, 5, 10],
    'classifier__max_features': ['sqrt', 'log2', 0.3, 0.5, 0.7],  # Mix of float and string
    'classifier__bootstrap': [True, False],
    'classifier__class_weight': [None, 'balanced']
}

try:
    logging.info("Starting randomized search for Random Forest...")
    
    # Use RepeatedStratifiedKFold without shuffle
    cv_rf = RepeatedStratifiedKFold(
        n_splits=N_SPLITS_CV, 
        n_repeats=N_REPEATS, 
        random_state=RANDOM_STATE
    )
    
    # RandomizedSearchCV to cover more combinations within reasonable compute time
    rf_random_search = RandomizedSearchCV(
        estimator=rf_pipeline,
        param_distributions=rf_param_dist,
        n_iter=30,  # Increase or decrease based on resources
        cv=cv_rf,
        scoring=SCORING_METRIC,
        n_jobs=-1,  # Use all available cores
        random_state=RANDOM_STATE,
        verbose=VERBOSE
    )
    
    # Fit the RandomizedSearchCV
    rf_random_search.fit(X_train_with_indicators, y_train)

    logging.info(f"Best parameters (RF): {rf_random_search.best_params_}")
    logging.info(f"Best cross-validation {SCORING_METRIC}: {rf_random_search.best_score_:.4f}")
    
    # Extract the best estimator
    best_rf = rf_random_search.best_estimator_

except Exception as e:
    logging.error(f"An error occurred during Random Forest randomized search: {e}")
    raise

# Evaluate the best Random Forest
try:
    best_rf.fit(X_train_with_indicators, y_train)
    y_pred_rf = best_rf.predict(X_test_with_indicators)
    y_pred_proba_rf = best_rf.predict_proba(X_test_with_indicators)[:, 1]

    logging.info("=== Best Random Forest Evaluation ===")
    logging.info("Confusion Matrix:\n" + str(confusion_matrix(y_test, y_pred_rf)))
    logging.info("\nClassification Report:\n" + str(classification_report(y_test, y_pred_rf)))
    logging.info(f"ROC AUC: {roc_auc_score(y_test, y_pred_proba_rf):.4f}")

    # Plot ROC
    fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_proba_rf)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr_rf, tpr_rf, label=f'AUC = {roc_auc_score(y_test, y_pred_proba_rf):.4f}')
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Random Forest ROC Curve on Test Data')
    plt.legend(loc='lower right')
    plt.show()

except Exception as e:
    logging.error(f"An error occurred during Random Forest training/evaluation: {e}")
    raise

logging.info("Script completed successfully.")
2025-02-14 11:32:08,285 - INFO - 
--- Random Forest (Revised) ---
2025-02-14 11:32:08,287 - INFO - Starting randomized search for Random Forest...
Fitting 5 folds for each of 30 candidates, totalling 150 fits
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/joblib/externals/loky/process_executor.py:752: UserWarning: A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.
  warnings.warn(
2025-02-14 11:34:57,748 - INFO - Best parameters (RF): {'classifier__n_estimators': 200, 'classifier__min_samples_split': 10, 'classifier__min_samples_leaf': 10, 'classifier__max_features': 0.3, 'classifier__max_depth': 20, 'classifier__class_weight': None, 'classifier__bootstrap': False}
2025-02-14 11:34:57,750 - INFO - Best cross-validation roc_auc: 0.9151
2025-02-14 11:35:09,935 - INFO - === Best Random Forest Evaluation ===
2025-02-14 11:35:09,942 - INFO - Confusion Matrix:
[[2373  556]
 [ 543 3074]]
2025-02-14 11:35:09,955 - INFO - 
Classification Report:
              precision    recall  f1-score   support

         0.0       0.81      0.81      0.81      2929
         1.0       0.85      0.85      0.85      3617

    accuracy                           0.83      6546
   macro avg       0.83      0.83      0.83      6546
weighted avg       0.83      0.83      0.83      6546

2025-02-14 11:35:09,960 - INFO - ROC AUC: 0.9139
No description has been provided for this image
2025-02-14 11:35:10,129 - INFO - Script completed successfully.
In [ ]:
# Define the model file path
model_filename = os.path.expanduser('~/work/vaping_project_data/best_rf_model.joblib')

# Save the trained model
joblib.dump(best_rf, model_filename)
logging.info(f"Model saved to {model_filename}")
2025-02-14 11:35:21,792 - INFO - Model saved to /storage/home/szn5432/work/vaping_project_data/best_rf_model.joblib
In [ ]:
# Load the model from the specified path
logging.info("Loading the model...")
best_rf = joblib.load(os.path.expanduser('~/work/vaping_project_data/best_rf_model.joblib'))
logging.info("Model loaded successfull")
2025-02-14 11:35:23,382 - INFO - Loading the model...
2025-02-14 11:35:23,571 - INFO - Model loaded successfull
In [ ]:
try:
    logging.info("Starting feature importance analysis...")

    # Access the RandomForestClassifier from the pipeline
    rf_model = best_rf.named_steps['classifier']

    # Get feature importances
    feature_importance = rf_model.feature_importances_

    # Access the preprocessor step
    preprocessor = best_rf.named_steps['preprocessor']

    # Get transformed feature names
    if hasattr(preprocessor, 'get_feature_names_out'):
        feature_names = preprocessor.get_feature_names_out()
    else:
        # Fallback: Generate feature names if get_feature_names_out is not available
        X_train_transformed = preprocessor.transform(X_train_with_indicators)
        feature_names = [f"Feature_{idx}" for idx in range(X_train_transformed.shape[1])]

    # Debugging: Print shapes and lengths
    logging.info(f"Shape of X_train_with_indicators: {X_train_with_indicators.shape}")
    logging.info(f"Length of feature_importance: {len(feature_importance)}")
    logging.info(f"Number of feature names: {len(feature_names)}")
    logging.info(f"Feature names: {feature_names}")

    # Check if lengths match
    if len(feature_names) != len(feature_importance):
        raise ValueError(
            f"Mismatch in lengths: feature_names ({len(feature_names)}) != feature_importance ({len(feature_importance)})"
        )

    # Create a DataFrame for visualization
    feature_importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': feature_importance
    })

    # Aggregate importances for original features
    original_feature_importance = {}

    for feature, importance in zip(feature_names, feature_importance):
        # Extract the original feature name (e.g., 'cat__V13_2' -> 'V13')
        original_feature = feature.split('__')[1].split('_')[0]

        # Sum importances for each original feature
        if original_feature in original_feature_importance:
            original_feature_importance[original_feature] += importance
        else:
            original_feature_importance[original_feature] = importance

    # Create a DataFrame for aggregated importances
    aggregated_importance_df = pd.DataFrame({
        'Feature': list(original_feature_importance.keys()),
        'Importance': list(original_feature_importance.values())
    })

    # Sort features by importance
    aggregated_importance_df = aggregated_importance_df.sort_values(by='Importance', ascending=False)

    # Plot aggregated feature importance
    plt.figure(figsize=(20, 12))
    sns.barplot(x='Importance', y='Feature', data=aggregated_importance_df, palette='viridis')
    plt.title('Aggregated Feature Importance (Original Features)')
    plt.xlabel('Importance')
    plt.ylabel('Feature')
    plt.tight_layout()
    plt.show()

    # Display top 20 feature importances
    top_20_features = aggregated_importance_df.head(20)
    print("Top 20 Feature Importances:")
    print(top_20_features)

except Exception as e:
    logging.error(f"An error occurred during feature importance analysis: {e}")
    raise
2025-02-14 11:35:31,865 - INFO - Starting feature importance analysis...
2025-02-14 11:35:31,890 - INFO - Shape of X_train_with_indicators: (26184, 100)
2025-02-14 11:35:31,890 - INFO - Length of feature_importance: 50
2025-02-14 11:35:31,891 - INFO - Number of feature names: 50
2025-02-14 11:35:31,891 - INFO - Feature names: ['num__V2178' 'num__V2188' 'num__V2197' 'num__V2184' 'num__V2186'
 'num__V2171' 'num__V2128' 'num__V2201' 'num__V2173' 'num__V2194'
 'num__V2166' 'num__wave' 'num__V2176' 'num__V2175' 'num__V2177'
 'num__V2116' 'num__V2125' 'num__V2182' 'num__sex' 'num__race'
 'num__V2460' 'num__RESPONDENT_AGE' 'num__V2185' 'num__V2193' 'num__V2163'
 'num__V49' 'num__V2108' 'num__V2101' 'num__V2180' 'num__V2164'
 'num__V2191' 'num__V2195' 'num__V2155' 'num__V2196' 'num__V2189'
 'num__V2179' 'num__V13' 'num__V2143' 'num__V2134' 'num__V2172'
 'num__V2137' 'num__V2140' 'num__V2105' 'num__V2157' 'num__V2183'
 'num__V2187' 'num__V2181' 'num__V2152' 'num__V2153' 'num__V2156']
/tmp/ipykernel_1489303/3638803247.py:63: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='Importance', y='Feature', data=aggregated_importance_df, palette='viridis')
No description has been provided for this image
Top 20 Feature Importances:
   Feature  Importance
11    wave    0.504127
15   V2116    0.112886
42   V2105    0.086625
27   V2101    0.060063
26   V2108    0.022534
33   V2196    0.019504
10   V2166    0.014651
19    race    0.014635
30   V2191    0.011908
9    V2194    0.010518
47   V2152    0.009940
31   V2195    0.008595
35   V2179    0.008535
24   V2163    0.007627
29   V2164    0.007486
36     V13    0.007007
23   V2193    0.006488
8    V2173    0.006179
3    V2184    0.006108
12   V2176    0.005942
In [ ]:
# best_rf is the best estimator from your RandomizedSearchCV
tree_model = best_rf.named_steps['classifier']

# Transform the entire training set
X_train_processed_full = best_rf.named_steps['preprocessor'].transform(X_train_with_indicators)

# Convert to DataFrame for easier sampling & feature naming
feature_names = best_rf.named_steps['preprocessor'].get_feature_names_out()
X_train_processed_df = pd.DataFrame(X_train_processed_full, columns=feature_names)

# Randomly sample 5000 rows from the processed data
X_background = X_train_processed_df.sample(n=1000, random_state=42)

# Create the explainer on just the 5000 background points
explainer = shap.TreeExplainer(tree_model, data=X_background)

# If you also want to compute shap values for the same subset (typical):
shap_values = explainer.shap_values(X_background)
shap_values_class1 = shap_values[1]
 99%|===================| 1989/2000 [03:08<00:01]        
In [ ]:
def get_original_feature_name(encoded_name):
    """
    Example parser that assumes you have names like 'cat__Gender_Male'
    or 'cat__MaritalStatus_Single'.

    We split on '__' to separate the transformer name from the remainder.
    Then we split the remainder on '_' and assume the first chunk is the original column
    (e.g., 'Gender' or 'MaritalStatus').
    """
    parts = encoded_name.split("__", maxsplit=1)
    if len(parts) == 2:
        # e.g. 'cat', 'Gender_Male'
        transformer_name, remainder = parts
        # Now parse the remainder by '_' to get the original column
        remainder_parts = remainder.split("_", maxsplit=1)
        original_col = remainder_parts[0]  # e.g. 'Gender'
        return original_col
    else:
        # If there's no double underscore or unexpected format,
        # just return the full encoded_name
        return encoded_name
In [ ]:
import matplotlib.pyplot as plt
from sklearn.inspection import PartialDependenceDisplay

top_10_features = top_20_original_features.index[:8].tolist()

# Create a 2-row x 5-column grid of subplots
fig, axs = plt.subplots(nrows=2, ncols=4, figsize=(20, 8))

# Flatten the axes array to pass it easily if needed
axs = axs.ravel()

pdp = PartialDependenceDisplay.from_estimator(
    best_rf,
    X_train_with_indicators,
    features=top_10_features,
    feature_names=X_train_with_indicators.columns,
    random_state=42,
    ax=axs  # pass axes array here
)

fig.suptitle("Partial Dependence Plots for Top 10 Features", fontsize=16, y=1.02)
plt.tight_layout()
No description has been provided for this image

Gradient Boosting Trees¶

In [ ]:
import logging
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RepeatedStratifiedKFold, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve
import matplotlib.pyplot as plt

# Random State for reproducibility
RANDOM_STATE = 42

# Use RepeatedStratifiedKFold for more robust validation
N_SPLITS_CV = 5
N_REPEATS = 1  # Repeat the CV multiple times if desired
SCORING_METRIC = 'roc_auc'
VERBOSE = 1

logging.info("\n--- Gradient Boosting (Revised) ---")

# Build pipeline
gbc_pipeline = Pipeline([
    ('preprocessor', preprocessor), 
    ('classifier', GradientBoostingClassifier(random_state=RANDOM_STATE))
])

# Expanded parameter distributions for RandomizedSearch
gbc_param_dist = {
    'classifier__n_estimators': [100, 300, 500, 1000],
    'classifier__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'classifier__max_depth': [3, 5, 10, 20],
    'classifier__subsample': [0.8, 0.9, 1.0],  # Controls sample ratio per tree
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 5],
    'classifier__max_features': ['sqrt', 'log2', None]
}

try:
    logging.info("Starting randomized search for Gradient Boosting...")

    # Use RepeatedStratifiedKFold without shuffle
    cv_gbc = RepeatedStratifiedKFold(
        n_splits=N_SPLITS_CV, 
        n_repeats=N_REPEATS, 
        random_state=RANDOM_STATE
    )

    # RandomizedSearchCV to cover more combinations within reasonable compute time
    gbc_random_search = RandomizedSearchCV(
        estimator=gbc_pipeline,
        param_distributions=gbc_param_dist,
        n_iter=50,  # Increase or decrease based on resources
        cv=cv_gbc,
        scoring=SCORING_METRIC,
        n_jobs=24,  # Use all available cores
        random_state=RANDOM_STATE,
        verbose=VERBOSE
    )

    # Fit the RandomizedSearchCV
    gbc_random_search.fit(X_train_with_indicators, y_train)

    logging.info(f"Best parameters (GBC): {gbc_random_search.best_params_}")
    logging.info(f"Best cross-validation {SCORING_METRIC}: {gbc_random_search.best_score_:.4f}")

    # Extract the best estimator
    best_gbc = gbc_random_search.best_estimator_

except Exception as e:
    logging.error(f"An error occurred during Gradient Boosting randomized search: {e}")
    raise

# Evaluate the best Gradient Boosting model
try:
    best_gbc.fit(X_train_with_indicators, y_train)
    y_pred_gbc = best_gbc.predict(X_test_with_indicators)
    y_pred_proba_gbc = best_gbc.predict_proba(X_test_with_indicators)[:, 1]

    logging.info("=== Best Gradient Boosting Evaluation ===")
    logging.info("Confusion Matrix:\n" + str(confusion_matrix(y_test, y_pred_gbc)))
    logging.info("\nClassification Report:\n" + str(classification_report(y_test, y_pred_gbc)))
    logging.info(f"ROC AUC: {roc_auc_score(y_test, y_pred_proba_gbc):.4f}")

    # Plot ROC Curve
    fpr_gbc, tpr_gbc, _ = roc_curve(y_test, y_pred_proba_gbc)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr_gbc, tpr_gbc, label=f'AUC = {roc_auc_score(y_test, y_pred_proba_gbc):.4f}')
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Gradient Boosting ROC Curve on Test Data')
    plt.legend(loc='lower right')
    plt.show()

except Exception as e:
    logging.error(f"An error occurred during Gradient Boosting training/evaluation: {e}")
    raise

logging.info("Script completed successfully.")
2025-02-14 14:35:57,123 - INFO - 
--- Gradient Boosting (Revised) ---
2025-02-14 14:35:57,125 - INFO - Starting randomized search for Gradient Boosting...
Fitting 5 folds for each of 50 candidates, totalling 250 fits
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/joblib/externals/loky/process_executor.py:752: UserWarning: A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.
  warnings.warn(
2025-02-14 14:45:46,214 - INFO - Best parameters (GBC): {'classifier__subsample': 0.9, 'classifier__n_estimators': 1000, 'classifier__min_samples_split': 10, 'classifier__min_samples_leaf': 5, 'classifier__max_features': None, 'classifier__max_depth': 5, 'classifier__learning_rate': 0.01}
2025-02-14 14:45:46,216 - INFO - Best cross-validation roc_auc: 0.9165
2025-02-14 14:46:53,258 - INFO - === Best Gradient Boosting Evaluation ===
2025-02-14 14:46:53,265 - INFO - Confusion Matrix:
[[2370  559]
 [ 529 3088]]
2025-02-14 14:46:53,278 - INFO - 
Classification Report:
              precision    recall  f1-score   support

         0.0       0.82      0.81      0.81      2929
         1.0       0.85      0.85      0.85      3617

    accuracy                           0.83      6546
   macro avg       0.83      0.83      0.83      6546
weighted avg       0.83      0.83      0.83      6546

2025-02-14 14:46:53,282 - INFO - ROC AUC: 0.9159
No description has been provided for this image
2025-02-14 14:46:53,446 - INFO - Script completed successfully.
In [ ]:
# Define the model file path
model_filename = os.path.expanduser('~/work/vaping_project_data/best_gbt_model.joblib')

# Save the trained model
joblib.dump(best_gbc, model_filename)
logging.info(f"Model saved to {model_filename}")
2025-02-14 14:49:47,488 - INFO - Model saved to /storage/home/szn5432/work/vaping_project_data/best_gbt_model.joblib
In [ ]:
# Load the model (when needed)
file_path = os.path.expanduser('~/work/vaping_project_data/best_gbt_model.joblib')
loaded_gbt = joblib.load(file_path)
print("Model loaded successfully.")
Model loaded successfully.
In [ ]:
# Access the pipeline for categorical features
num_pipeline = loaded_gbt.named_steps['preprocessor'].named_transformers_['num']
# Then access the OneHotEncoder within that pipeline
ohe = num_pipeline.named_steps['scaler']
# Now get the encoded feature names
encoded_feature_names = ohe.get_feature_names_out(numeric_features)
encoded_feature_names
Out[ ]:
array(['V2178', 'V2188', 'V2197', 'V2184', 'V2186', 'V2171', 'V2128',
       'V2201', 'V2173', 'V2194', 'V2166', 'wave', 'V2176', 'V2175',
       'V2177', 'V2116', 'V2125', 'V2182', 'sex', 'race', 'V2460',
       'RESPONDENT_AGE', 'V2185', 'V2193', 'V2163', 'V49', 'V2108',
       'V2101', 'V2180', 'V2164', 'V2191', 'V2195', 'V2155', 'V2196',
       'V2189', 'V2179', 'V13', 'V2143', 'V2134', 'V2172', 'V2137',
       'V2140', 'V2105', 'V2157', 'V2183', 'V2187', 'V2181', 'V2152',
       'V2153', 'V2156'], dtype=object)
In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 1. Get the numerical pipeline
num_pipeline = loaded_gbt.named_steps['preprocessor'].named_transformers_['num']

# 2. Get the scaler (StandardScaler or whatever you named it)
scaler = num_pipeline.named_steps['scaler']

# 3. Get the feature names out (they will match one-to-one with your numeric_features)
encoded_feature_names = scaler.get_feature_names_out(numeric_features)

# 4. Get the trained classifier and its feature importances
gbt_classifier = loaded_gbt.named_steps['classifier']
importances = gbt_classifier.feature_importances_

# 5. Build a DataFrame of features vs. importances
feature_importance_df = pd.DataFrame({
    'Feature': encoded_feature_names,
    'Importance': importances
})

# 6. Sort by ascending order of importance
feature_importance_df = feature_importance_df.sort_values('Importance', ascending=True)

# 7. Take top 20 features and plot
top_20 = feature_importance_df.tail(20)

plt.figure(figsize=(12, 8))
plt.barh(y=top_20['Feature'], width=top_20['Importance'])
plt.title('Top 20 Most Important Original Features')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

# 8. Print the DataFrame in descending order for readability
print("\nFeature Importance Rankings:")
print(feature_importance_df.sort_values('Importance', ascending=False))
No description has been provided for this image
Feature Importance Rankings:
           Feature  Importance
15           V2116    0.333590
11            wave    0.318799
42           V2105    0.111097
27           V2101    0.084814
19            race    0.024483
33           V2196    0.017528
26           V2108    0.011029
10           V2166    0.010013
9            V2194    0.009626
47           V2152    0.005866
30           V2191    0.005776
36             V13    0.004605
31           V2195    0.004590
29           V2164    0.004188
35           V2179    0.004186
7            V2201    0.003781
12           V2176    0.003388
2            V2197    0.003107
8            V2173    0.003022
23           V2193    0.002970
24           V2163    0.002806
18             sex    0.002401
13           V2175    0.002381
3            V2184    0.002251
0            V2178    0.002250
6            V2128    0.002124
14           V2177    0.002117
17           V2182    0.001385
46           V2181    0.001374
39           V2172    0.001293
28           V2180    0.001276
25             V49    0.001213
37           V2143    0.001157
44           V2183    0.001075
48           V2153    0.000920
21  RESPONDENT_AGE    0.000846
43           V2157    0.000752
22           V2185    0.000683
1            V2188    0.000635
40           V2137    0.000580
45           V2187    0.000557
32           V2155    0.000530
49           V2156    0.000512
4            V2186    0.000510
5            V2171    0.000476
38           V2134    0.000447
34           V2189    0.000418
16           V2125    0.000411
20           V2460    0.000104
41           V2140    0.000057
In [ ]:
# 1) Sort by descending importance and take top 10.
feature_importance_df = feature_importance_df.sort_values('Importance', ascending=False)
top_10_features = feature_importance_df['Feature'].head(13).tolist()

# 2) Exclude features that start with "missing_" 
#    or that don’t actually exist in X_train_with_indicators.
filtered_features = [
    f for f in top_10_features 
    if not f.startswith('missing_') and f in X_train_with_indicators.columns
]

print("Top 10 original features:", top_10_features)
print("Filtered features (excluding 'missing_'): ", filtered_features)

# 3) Plot PDPs for the filtered features
n_features = len(filtered_features)
n_rows = (n_features + 1) // 2  # so we can arrange them in a grid
n_cols = 2
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(12, 4*n_rows), squeeze=False)
axes = axes.flatten()

for i, feat in enumerate(filtered_features):
    ax = axes[i]
    PartialDependenceDisplay.from_estimator(
        estimator=loaded_gbt,
        X=X_train_with_indicators,
        features=[feat],
        feature_names=X_train_with_indicators.columns,
        kind='average',  # or 'both' if you want ICE lines
        ax=ax
    )
    ax.set_title(f"PDP for {feat}")

# If there are unused subplots, hide them
for j in range(i+1, len(axes)):
    axes[j].set_visible(False)

plt.tight_layout()
plt.show()
Top 10 original features: ['V2116', 'wave', 'V2105', 'V2101', 'race', 'V2196', 'V2108', 'V2166', 'V2194', 'V2152', 'V2191', 'V13', 'V2195']
Filtered features (excluding 'missing_'):  ['V2116', 'wave', 'V2105', 'V2101', 'race', 'V2196', 'V2108', 'V2166', 'V2194', 'V2152', 'V2191', 'V13', 'V2195']
No description has been provided for this image
In [ ]:
X_train_transformed = preprocessor.transform(X_train_with_indicators)
print("Shape of X_train_transformed:", X_train_transformed.shape)
print("Dtypes (if it is a NumPy array):", X_train_transformed.dtype)

# If X_train_transformed is a Pandas DataFrame:
if hasattr(X_train_transformed, 'dtypes'):
    print(X_train_transformed.dtypes)

X_train_with_indicators.info()  # or X_train_with_indicators.isna().sum() if DataFrame
Shape of X_train_transformed: (26184, 50)
Dtypes (if it is a NumPy array): float64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26184 entries, 0 to 26183
Data columns (total 100 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   V2178                   25473 non-null  float64
 1   V2188                   25162 non-null  float64
 2   V2197                   24821 non-null  float64
 3   V2184                   24562 non-null  float64
 4   V2186                   25162 non-null  float64
 5   V2171                   25690 non-null  float64
 6   V2128                   25948 non-null  float64
 7   V2201                   24792 non-null  float64
 8   V2173                   25486 non-null  float64
 9   V2194                   25142 non-null  float64
 10  V2166                   25505 non-null  float64
 11  wave                    26184 non-null  int64  
 12  V2176                   24467 non-null  float64
 13  V2175                   24912 non-null  float64
 14  V2177                   24843 non-null  float64
 15  V2116                   25637 non-null  float64
 16  V2125                   25846 non-null  float64
 17  V2182                   24349 non-null  float64
 18  sex                     26184 non-null  float64
 19  race                    26184 non-null  float64
 20  V2460                   25156 non-null  float64
 21  RESPONDENT_AGE          26137 non-null  float64
 22  V2185                   25162 non-null  float64
 23  V2193                   23619 non-null  float64
 24  V2163                   25864 non-null  float64
 25  V49                     26007 non-null  float64
 26  V2108                   25138 non-null  float64
 27  V2101                   25965 non-null  float64
 28  V2180                   24492 non-null  float64
 29  V2164                   25852 non-null  float64
 30  V2191                   25206 non-null  float64
 31  V2195                   25038 non-null  float64
 32  V2155                   25956 non-null  float64
 33  V2196                   25035 non-null  float64
 34  V2189                   25162 non-null  float64
 35  V2179                   25454 non-null  float64
 36  V13                     26184 non-null  int64  
 37  V2143                   25996 non-null  float64
 38  V2134                   25961 non-null  float64
 39  V2172                   25550 non-null  float64
 40  V2137                   25919 non-null  float64
 41  V2140                   26040 non-null  float64
 42  V2105                   25365 non-null  float64
 43  V2157                   25956 non-null  float64
 44  V2183                   24976 non-null  float64
 45  V2187                   25162 non-null  float64
 46  V2181                   24370 non-null  float64
 47  V2152                   26108 non-null  float64
 48  V2153                   25963 non-null  float64
 49  V2156                   25956 non-null  float64
 50  missing_V2178           26184 non-null  bool   
 51  missing_V2188           26184 non-null  bool   
 52  missing_V2197           26184 non-null  bool   
 53  missing_V2184           26184 non-null  bool   
 54  missing_V2186           26184 non-null  bool   
 55  missing_V2171           26184 non-null  bool   
 56  missing_V2128           26184 non-null  bool   
 57  missing_V2201           26184 non-null  bool   
 58  missing_V2173           26184 non-null  bool   
 59  missing_V2194           26184 non-null  bool   
 60  missing_V2166           26184 non-null  bool   
 61  missing_wave            26184 non-null  bool   
 62  missing_V2176           26184 non-null  bool   
 63  missing_V2175           26184 non-null  bool   
 64  missing_V2177           26184 non-null  bool   
 65  missing_V2116           26184 non-null  bool   
 66  missing_V2125           26184 non-null  bool   
 67  missing_V2182           26184 non-null  bool   
 68  missing_sex             26184 non-null  bool   
 69  missing_race            26184 non-null  bool   
 70  missing_V2460           26184 non-null  bool   
 71  missing_RESPONDENT_AGE  26184 non-null  bool   
 72  missing_V2185           26184 non-null  bool   
 73  missing_V2193           26184 non-null  bool   
 74  missing_V2163           26184 non-null  bool   
 75  missing_V49             26184 non-null  bool   
 76  missing_V2108           26184 non-null  bool   
 77  missing_V2101           26184 non-null  bool   
 78  missing_V2180           26184 non-null  bool   
 79  missing_V2164           26184 non-null  bool   
 80  missing_V2191           26184 non-null  bool   
 81  missing_V2195           26184 non-null  bool   
 82  missing_V2155           26184 non-null  bool   
 83  missing_V2196           26184 non-null  bool   
 84  missing_V2189           26184 non-null  bool   
 85  missing_V2179           26184 non-null  bool   
 86  missing_V13             26184 non-null  bool   
 87  missing_V2143           26184 non-null  bool   
 88  missing_V2134           26184 non-null  bool   
 89  missing_V2172           26184 non-null  bool   
 90  missing_V2137           26184 non-null  bool   
 91  missing_V2140           26184 non-null  bool   
 92  missing_V2105           26184 non-null  bool   
 93  missing_V2157           26184 non-null  bool   
 94  missing_V2183           26184 non-null  bool   
 95  missing_V2187           26184 non-null  bool   
 96  missing_V2181           26184 non-null  bool   
 97  missing_V2152           26184 non-null  bool   
 98  missing_V2153           26184 non-null  bool   
 99  missing_V2156           26184 non-null  bool   
dtypes: bool(50), float64(48), int64(2)
memory usage: 11.2 MB
In [ ]:
##### SHAP Feature Importance ####

# Extract the GradientBoostingClassifier
gbt_models = loaded_gbt.named_steps['classifier']
# Get preprocessed features
X_preprocessed = loaded_gbt.named_steps['preprocessor'].transform(X_test_with_indicators)
# Create SHAP explainer
explainer = shap.TreeExplainer(gbt_models)
# Calculate SHAP values
shap_values = explainer.shap_values(X_preprocessed)
# Get feature names after preprocessing
feature_names = loaded_gbt.named_steps['preprocessor'].get_feature_names_out()
In [ ]:
# Create visualizations
# Summary plot
shap.summary_plot(shap_values, X_preprocessed, feature_names=feature_names)
# Bar plot of feature importance
shap.summary_plot(shap_values, X_preprocessed, feature_names=feature_names, plot_type='bar')
No description has been provided for this image
No description has been provided for this image
In [ ]:
# 1. Aggregate SHAP values by base feature
feature_importances = {}
for i, col in enumerate(feature_names):
    base_feature = col.replace('num__', '')  # Extract base feature name
    if base_feature not in feature_importances:
        feature_importances[base_feature] = []
    feature_importances[base_feature].extend(np.abs(shap_values[:, i]))

# 2. Calculate mean absolute SHAP value for each base feature
aggregated_importances = {
    feature: np.mean(values) for feature, values in feature_importances.items()
}

# 3. Sort features by importance
sorted_importances = sorted(
    aggregated_importances.items(), key=lambda item: item[1], reverse=True
)

# 4. Create a DataFrame for plotting
importance_df = pd.DataFrame(sorted_importances, columns=['Feature', 'Importance'])

# Filter to show only the top 20 features
top_20_importance_df = importance_df.head(20)
In [ ]:
# 5. Create the bar plot
plt.figure(figsize=(12, 8))  # Adjust size as needed
plt.barh(top_20_importance_df['Feature'], top_20_importance_df['Importance'], color='dodgerblue')
plt.xlabel('Mean Absolute SHAP Value')
plt.ylabel('Feature')
plt.title('Top 20 Features Ranked by Mean Absolute SHAP Value')
plt.gca().invert_yaxis()  # Most important feature on top
plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
# For individual predictions (e.g., first sample)
shap.initjs()  # Initialize JavaScript visualization
single_sample_idx = 0
shap.force_plot(explainer.expected_value[1] if isinstance(shap_values, list) else explainer.expected_value,
                shap_values[single_sample_idx] if isinstance(shap_values, list) else shap_values[single_sample_idx,:],
                X_preprocessed[single_sample_idx],
                feature_names=feature_names)
No description has been provided for this image
Out[ ]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [ ]:
###### Demonstration of SHAP feature importance for one individual case ###########

# For individual predictions (e.g., first sample)
single_sample_idx = 0
single_sample_shap_values = shap_values[single_sample_idx] if isinstance(shap_values, list) else shap_values[single_sample_idx, :]

# Aggregate SHAP values by base feature for the single sample
feature_importances = {}
for i, feature_name in enumerate(feature_names):
    base_feature = col.replace('num__', '')
  # Extract base feature name
    shap_value = single_sample_shap_values[i]
    feature_importances[base_feature] = feature_importances.get(base_feature, 0) + abs(shap_value)

# Sort features by importance
sorted_importances = sorted(feature_importances.items(), key=lambda item: item[1], reverse=True)

# Print the ranked feature importance for the single prediction
print(f"Overall Feature Importance for Sample {single_sample_idx}:")
for feature, importance in sorted_importances:
    print(f"{feature}: {importance:.4f}")
Overall Feature Importance for Sample 0:
V2156: 3.5576
In [ ]:
# SHAP feature interaction

# Calculate SHAP interaction values
shap_interaction_values = explainer.shap_interaction_values(X_preprocessed)
In [ ]:
import numpy as np
import pandas as pd

def aggregate_shap_interactions(shap_interaction_values, feature_names, get_base_feature):
    """
    Aggregates pairwise SHAP interaction values back to their original (pre–one-hot) features.
    
    Parameters
    ----------
    shap_interaction_values : np.ndarray
        SHAP interaction values of shape [n_samples, n_features, n_features].
    feature_names : list of str
        The one-hot-encoded feature names corresponding to shap_interaction_values.
    get_base_feature : callable
        A function that takes a one-hot-encoded feature name and returns the base/original feature name.
    
    Returns
    -------
    pd.DataFrame
        DataFrame with ["Feature1", "Feature2", "InteractionValue", "AbsInteraction"] 
        sorted in descending order of AbsInteraction.
    """
    # 1. Aggregate across samples (e.g., mean absolute interactions)
    interaction_matrix = np.mean(np.abs(shap_interaction_values), axis=0)
    
    # 2. Map each OHE feature to a base feature
    base_feature_names = [get_base_feature(n) for n in feature_names]
    unique_base_features = list(set(base_feature_names))
    
    # 3. Build a structure to accumulate aggregated pairwise interactions
    aggregated_interactions = {
        bf_i: {bf_j: 0.0 for bf_j in unique_base_features}
        for bf_i in unique_base_features
    }

    n_features = len(feature_names)
    for i in range(n_features):
        for j in range(i+1, n_features): # i+1 => no diagonal, no duplicates
            bf_i = base_feature_names[i]
            bf_j = base_feature_names[j]
            aggregated_interactions[bf_i][bf_j] += interaction_matrix[i, j]
    
    # 4. Convert to DataFrame
    data_records = []
    for bf_i in unique_base_features:
        for bf_j in unique_base_features:
            # If you want to keep only i <= j, add a condition to avoid duplicates
            interaction_val = aggregated_interactions[bf_i][bf_j]
            data_records.append((bf_i, bf_j, interaction_val))
    
    df_interactions = pd.DataFrame(data_records, columns=["Feature1", "Feature2", "InteractionValue"])
    df_interactions["AbsInteraction"] = df_interactions["InteractionValue"].abs()
    
    # Sort descending by absolute interaction
    df_interactions.sort_values("AbsInteraction", ascending=False, inplace=True)
    df_interactions.reset_index(drop=True, inplace=True)
    df_interactions_no_diagonal = df_interactions[df_interactions['Feature1'] != df_interactions['Feature2']]
    return df_interactions_no_diagonal


# Example usage:
def simple_get_base_feature(name):
    # If it has the "num__" prefix, strip it off
    if name.startswith("num__"):
        name = name[len("num__"):]  # "V2178"
    # If it has the "cat__" prefix, strip that as well
    if name.startswith("cat__"):
        name = name[len("cat__"):] 
    # Now 'name' might look like "V2178"
    # Just return it as the base feature
    return name

df_agg_interactions = aggregate_shap_interactions(
    shap_interaction_values=shap_interaction_values,
    feature_names=feature_names,
    get_base_feature=simple_get_base_feature
)

# Print the top 20 interactions
print(df_agg_interactions.head(20))

# --- Pick Top 20 Interactions ---
df_top_20 = df_agg_interactions.head(20).copy()

# Create a convenient label for each pair
df_top_20["Pair"] = df_top_20["Feature1"] + " & " + df_top_20["Feature2"]

# --- Plot ---
plt.figure(figsize=(10, 6))
sns.barplot(
    data=df_top_20, 
    y="Pair", 
    x="AbsInteraction", 
    color="royalblue"
)
plt.title("Top 20 Pairwise Feature Interactions by Absolute SHAP Value")
plt.xlabel("Absolute SHAP Interaction Value")
plt.ylabel("Feature Pair")
plt.tight_layout()
plt.show()
   Feature1 Feature2  InteractionValue  AbsInteraction
0      wave    V2116          0.381856        0.381856
1      wave    V2105          0.305814        0.305814
2      wave    V2101          0.181126        0.181126
3      wave    V2196          0.057863        0.057863
4      wave     race          0.055804        0.055804
5     V2116    V2105          0.049386        0.049386
6     V2194     wave          0.040912        0.040912
7      wave    V2108          0.035271        0.035271
8     V2166     wave          0.033239        0.033239
9     V2116    V2101          0.033163        0.033163
10     wave    V2191          0.028146        0.028146
11     wave      sex          0.022875        0.022875
12     wave    V2195          0.022266        0.022266
13     wave    V2176          0.020286        0.020286
14    V2116     race          0.017869        0.017869
15    V2101    V2105          0.017857        0.017857
16    V2201     wave          0.017011        0.017011
17    V2196    V2105          0.016886        0.016886
18     wave    V2152          0.015859        0.015859
19     wave      V13          0.015846        0.015846
No description has been provided for this image
In [ ]:
# Interaction of the top 2 features
top_features_indices = np.argsort(np.abs(shap_values).mean(0))[-2:]  # Get indices of top 2 features
feature1_idx = top_features_indices[0]
feature2_idx = top_features_indices[1]
feature1_name = feature_names[feature1_idx]
feature2_name = feature_names[feature2_idx]
In [ ]:
# Visualize the interaction between the top two features
shap.dependence_plot(
    feature1_idx,
    shap_values,
    X_preprocessed,
    feature_names=feature_names,
    interaction_index=feature2_idx,
)

shap.dependence_plot(
    feature2_idx,
    shap_values,
    X_preprocessed,
    feature_names=feature_names,
    interaction_index=feature1_idx,
)
No description has been provided for this image
No description has been provided for this image
In [ ]:
# 1. Group feature indices by base feature (removing "num__" prefix).
base_feature_indices = {}
for i, feature_name in enumerate(feature_names):
    # Remove "num__" so each feature remains distinct
    base_feature = feature_name.replace("num__", "")
    if base_feature not in base_feature_indices:
        base_feature_indices[base_feature] = []
    base_feature_indices[base_feature].append(i)

base_feature_list = list(base_feature_indices.keys())

# 2. Calculate overall interaction importance for each pair of base features
#    without duplicating reversed pairs (bf1,bf2) vs (bf2,bf1).
base_feature_interaction_importance = {}

for bf1_idx in range(len(base_feature_list)):
    for bf2_idx in range(bf1_idx + 1, len(base_feature_list)):
        bf1 = base_feature_list[bf1_idx]
        bf2 = base_feature_list[bf2_idx]

        # Sum up the pairwise interactions between *all* sub-indices of bf1 and bf2.
        interaction_sum = 0.0
        for i in base_feature_indices[bf1]:
            for j in base_feature_indices[bf2]:
                if isinstance(shap_interaction_values, list):
                    # e.g., for multiclass or ensemble, focusing on shap_interaction_values[0]
                    interaction_sum += shap_interaction_values[0][0, i, j]
                    # If you want both i->j and j->i, add shap_interaction_values[0][0, j, i]
                    # but usually shap_interaction_values[i,j] == shap_interaction_values[j,i].
                else:
                    interaction_sum += shap_interaction_values[0, i, j]
                    # Same note here if you want both directions.

        # Use absolute value as "importance"
        pair_key = (bf1, bf2)  # We already enforce bf1_idx < bf2_idx
        base_feature_interaction_importance[pair_key] = abs(interaction_sum)

# 3. Sort base feature interactions by importance
sorted_base_feature_interactions = sorted(
    base_feature_interaction_importance.items(),
    key=lambda item: item[1],
    reverse=True
)

# 4. Print the top 50 ranked base feature interactions (no (bf, bf), no reversed duplicates)
print("Top 50 Overall Base Feature Interaction Importance (Skipping self-interactions & duplicates):")
for (bf1, bf2), importance in sorted_base_feature_interactions[:50]:
    print(f"Interaction between {bf1} and {bf2}: {importance:.4f}")
Top 50 Overall Base Feature Interaction Importance (Skipping self-interactions & duplicates):
Interaction between wave and V2105: 0.4028
Interaction between wave and V2116: 0.3704
Interaction between wave and V2101: 0.1144
Interaction between V2201 and wave: 0.1128
Interaction between V2166 and wave: 0.0705
Interaction between wave and V2196: 0.0540
Interaction between wave and race: 0.0299
Interaction between V2184 and wave: 0.0246
Interaction between V2116 and race: 0.0215
Interaction between wave and V2195: 0.0202
Interaction between wave and sex: 0.0202
Interaction between wave and V2176: 0.0186
Interaction between V2194 and wave: 0.0180
Interaction between wave and V2108: 0.0171
Interaction between wave and V2191: 0.0163
Interaction between V2166 and V2116: 0.0128
Interaction between race and V2105: 0.0127
Interaction between V2184 and V2175: 0.0121
Interaction between V2175 and sex: 0.0117
Interaction between wave and V2152: 0.0094
Interaction between V2116 and V2105: 0.0090
Interaction between V2196 and V2105: 0.0089
Interaction between V2201 and sex: 0.0083
Interaction between V2201 and V2105: 0.0081
Interaction between V2166 and V2157: 0.0078
Interaction between V2201 and V2196: 0.0073
Interaction between sex and V2105: 0.0072
Interaction between V2184 and V2157: 0.0072
Interaction between V2175 and V2191: 0.0070
Interaction between V2201 and V2166: 0.0068
Interaction between V2194 and V2105: 0.0068
Interaction between V2116 and V2101: 0.0067
Interaction between wave and V2177: 0.0067
Interaction between V2166 and V2105: 0.0067
Interaction between V2194 and V2166: 0.0064
Interaction between V2201 and V2101: 0.0061
Interaction between wave and V13: 0.0061
Interaction between V2116 and V2152: 0.0058
Interaction between V2197 and wave: 0.0058
Interaction between V2175 and V2105: 0.0056
Interaction between V2116 and sex: 0.0056
Interaction between wave and V2193: 0.0056
Interaction between V2194 and V2175: 0.0054
Interaction between V2176 and sex: 0.0050
Interaction between V2195 and V2105: 0.0048
Interaction between V2166 and sex: 0.0048
Interaction between sex and V2152: 0.0047
Interaction between V2173 and wave: 0.0047
Interaction between V2176 and V2105: 0.0045
Interaction between V2166 and V2152: 0.0043
In [ ]:
# 3. Create a matrix for the heatmap
num_base_features = len(base_feature_names)
interaction_matrix = np.zeros((num_base_features, num_base_features))

for i, bf1 in enumerate(base_feature_names):
    for j, bf2 in enumerate(base_feature_names):
        # Use the sorted tuple for lookup
        key = tuple(sorted((bf1, bf2)))
        if key in base_feature_interaction_importance:
            interaction_matrix[i, j] = base_feature_interaction_importance[key]
        #else:
        #    print(f"Warning: Interaction not found for {bf1}, {bf2}") # Optional debugging

# 4. Visualize the aggregated interaction using a heatmap
plt.figure(figsize=(30, 28))
plt.imshow(interaction_matrix, cmap='coolwarm', aspect='auto')
plt.colorbar(label='Aggregated Interaction Strength')
plt.xticks(range(num_base_features), base_feature_names, rotation=45, ha="right")
plt.yticks(range(num_base_features), base_feature_names)
plt.title('Aggregated SHAP Interaction Between Base Features')
plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
# Interaction Summary Plot (for overall interaction strengths)
shap.summary_plot(shap_interaction_values, X_preprocessed, feature_names=feature_names)
No description has been provided for this image

Histogram-based Gradient Boost Classifier¶

In [ ]:
import logging
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RepeatedStratifiedKFold, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
from sklearn.preprocessing import FunctionTransformer

# Random State for reproducibility
RANDOM_STATE = 42

# Use RepeatedStratifiedKFold for more robust validation
N_SPLITS_CV = 5
N_REPEATS = 2  # Repeat the CV multiple times if desired
SCORING_METRIC = 'roc_auc'
VERBOSE = 1

logging.info("\n--- Gradient Boosting (Revised) ---")

# Define the transformer:
from sklearn.preprocessing import FunctionTransformer

def to_dense_func(X):
    """Convert sparse matrices to dense arrays (if needed)."""
    return X.toarray() if hasattr(X, 'toarray') else X

to_dense = FunctionTransformer(to_dense_func)

# Build pipeline
gbc_pipeline = Pipeline([
    ('preprocessor', preprocessor), 
    ('to_dense', to_dense),
    ('classifier', HistGradientBoostingClassifier(random_state=RANDOM_STATE))
])

# Expanded parameter distributions for RandomizedSearch
# Optimized parameter grid for 24-core/100GB RAM
param_grid = {
    'classifier__learning_rate': [0.01, 0.05, 0.1, 0.2],  # Wider range including very low rates
    'classifier__max_depth': [6, 12, 18],                 # Deeper trees with more variation
    'classifier__min_samples_leaf': [20, 50, 100],        # More granular leaf sizes
    'classifier__l2_regularization': [0.0, 0.1, 0.5, 1.0],# Stronger regularization options
    'classifier__max_bins': [255],                        # Keep max bins for accuracy
    'classifier__max_leaf_nodes': [64, 128, 256],         # Control tree complexity
    'classifier__max_iter': [2000],                       # Let early stopping handle actual iterations
}

try:
    logging.info("Starting randomized search for Gradient Boosting...")

    # Use RepeatedStratifiedKFold without shuffle
    cv_gbc = RepeatedStratifiedKFold(
        n_splits=N_SPLITS_CV, 
        n_repeats=N_REPEATS, 
        random_state=RANDOM_STATE
    )

    # RandomizedSearchCV to cover more combinations within reasonable compute time
    gbc_random_search = RandomizedSearchCV(
        estimator=gbc_pipeline,
        param_distributions=param_grid,
        n_iter=50,  # Increase or decrease based on resources
        cv=cv_gbc,
        scoring=SCORING_METRIC,
        n_jobs=24,  # Use all available cores
        random_state=RANDOM_STATE,
        verbose=VERBOSE
    )

    # Fit the RandomizedSearchCV
    gbc_random_search.fit(X_train_with_indicators, y_train)

    logging.info(f"Best parameters (GBC): {gbc_random_search.best_params_}")
    logging.info(f"Best cross-validation {SCORING_METRIC}: {gbc_random_search.best_score_:.4f}")

    # Extract the best estimator
    best_gbc = gbc_random_search.best_estimator_

except Exception as e:
    logging.error(f"An error occurred during Gradient Boosting randomized search: {e}")
    raise

# Evaluate the best Gradient Boosting model
try:
    best_gbc.fit(X_train_with_indicators, y_train)
    y_pred_gbc = best_gbc.predict(X_test_with_indicators)
    y_pred_proba_gbc = best_gbc.predict_proba(X_test_with_indicators)[:, 1]

    logging.info("=== Best Gradient Boosting Evaluation ===")
    logging.info("Confusion Matrix:\n" + str(confusion_matrix(y_test, y_pred_gbc)))
    logging.info("\nClassification Report:\n" + str(classification_report(y_test, y_pred_gbc)))
    logging.info(f"ROC AUC: {roc_auc_score(y_test, y_pred_proba_gbc):.4f}")

    # Plot ROC Curve
    fpr_gbc, tpr_gbc, _ = roc_curve(y_test, y_pred_proba_gbc)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr_gbc, tpr_gbc, label=f'AUC = {roc_auc_score(y_test, y_pred_proba_gbc):.4f}')
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Gradient Boosting ROC Curve on Test Data')
    plt.legend(loc='lower right')
    plt.show()

except Exception as e:
    logging.error(f"An error occurred during Gradient Boosting training/evaluation: {e}")
    raise

logging.info("Script completed successfully.")
2025-02-14 17:06:17,375 - INFO - 
--- Gradient Boosting (Revised) ---
2025-02-14 17:06:17,376 - INFO - Starting randomized search for Gradient Boosting...
Fitting 10 folds for each of 50 candidates, totalling 500 fits
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
2025-02-14 17:15:26,165 - INFO - Best parameters (GBC): {'classifier__min_samples_leaf': 50, 'classifier__max_leaf_nodes': 64, 'classifier__max_iter': 2000, 'classifier__max_depth': 18, 'classifier__max_bins': 255, 'classifier__learning_rate': 0.05, 'classifier__l2_regularization': 0.1}
2025-02-14 17:15:26,166 - INFO - Best cross-validation roc_auc: 0.9098
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
2025-02-14 17:15:31,141 - INFO - === Best Gradient Boosting Evaluation ===
2025-02-14 17:15:31,148 - INFO - Confusion Matrix:
[[2369  560]
 [ 571 3046]]
2025-02-14 17:15:31,161 - INFO - 
Classification Report:
              precision    recall  f1-score   support

         0.0       0.81      0.81      0.81      2929
         1.0       0.84      0.84      0.84      3617

    accuracy                           0.83      6546
   macro avg       0.83      0.83      0.83      6546
weighted avg       0.83      0.83      0.83      6546

2025-02-14 17:15:31,165 - INFO - ROC AUC: 0.9091
No description has been provided for this image
2025-02-14 17:15:31,421 - INFO - Script completed successfully.
In [ ]:
# Define the model file path
model_filename = os.path.expanduser('~/work/vaping_project_data/best_hgbt_model.joblib')

# Save the trained model
joblib.dump(best_gbc, model_filename)
logging.info(f"Model saved to {model_filename}")
2025-02-14 17:16:56,234 - INFO - Model saved to /storage/home/szn5432/work/vaping_project_data/best_hgbt_model.joblib
In [ ]:
# Load the model (when needed)
file_path = os.path.expanduser('~/work/vaping_project_data/best_hgbt_model.joblib')
loaded_hgbt = joblib.load(file_path)
print("Model loaded successfully.")
Model loaded successfully.
In [ ]:
from sklearn.inspection import permutation_importance

# Calculate permutation importance on RAW DATA (let pipeline handle preprocessing)
result = permutation_importance(
    loaded_hgbt,  # This is your full pipeline
    X_test_with_indicators,  # Raw data with missing indicators
    y_test,
    n_repeats=5,
    random_state=RANDOM_STATE,
    n_jobs=CPU_COUNT
)

# Get feature names from the raw data (including missing indicators)
feature_names = X_test_with_indicators.columns.tolist()

# Create importance DataFrame
perm_importance = pd.DataFrame({
    'Feature': feature_names,
    'Importance': result.importances_mean
}).sort_values('Importance', ascending=False)

# Plot top 20
plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=perm_importance.head(20))
plt.title("Top 20 Features by Permutation Importance (Raw Features)")
plt.show()
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
No description has been provided for this image
In [ ]:
# Create a table for top 20 feature importances
top_20_features = perm_importance.head(20)

# Display the table
print("Top 20 Feature Importances:")
display(top_20_features.style.background_gradient(cmap='Blues', subset=['Importance']))
Top 20 Feature Importances:
  Feature Importance
11 wave 0.315643
15 V2116 0.065383
27 V2101 0.031103
42 V2105 0.025023
19 race 0.015826
26 V2108 0.004980
7 V2201 0.001833
17 V2182 0.001253
14 V2177 0.001130
12 V2176 0.001130
18 sex 0.000978
33 V2196 0.000886
24 V2163 0.000886
2 V2197 0.000825
25 V49 0.000794
28 V2180 0.000764
6 V2128 0.000733
9 V2194 0.000733
10 V2166 0.000733
49 V2156 0.000703
In [ ]:
##### SHAP Feature Importance ####

# Extract the GradientBoostingClassifier
hgbt_models = loaded_hgbt.named_steps['classifier']
# Get preprocessed features
X_preprocessed = loaded_hgbt.named_steps['preprocessor'].transform(X_test_with_indicators)
# Convert sparse matrix to DataFrame
X_preprocessed = pd.DataFrame(X_preprocessed.toarray())
# Create SHAP explainer
explainer = shap.TreeExplainer(hgbt_models)
# Calculate SHAP values
shap_values = explainer.shap_values(X_preprocessed)
# Get feature names after preprocessing
feature_names = loaded_hgbt.named_steps['preprocessor'].get_feature_names_out()
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
In [ ]:
# 1. Aggregate SHAP values by base feature
feature_importances = {}
for i, col in enumerate(feature_names):
    base_feature = '_'.join(col.split('_')[:-1])  # Extract base feature name
    if base_feature not in feature_importances:
        feature_importances[base_feature] = []
    feature_importances[base_feature].extend(np.abs(shap_values[:, i]))

# 2. Calculate mean absolute SHAP value for each base feature
aggregated_importances = {
    feature: np.mean(values) for feature, values in feature_importances.items()
}

# 3. Sort features by importance
sorted_importances = sorted(
    aggregated_importances.items(), key=lambda item: item[1], reverse=True
)

# 4. Create a DataFrame for plotting
importance_df = pd.DataFrame(sorted_importances, columns=['Feature', 'Importance'])

# Filter to show only the top 20 features
top_20_importance_df = importance_df.head(20)

# 5. Create the bar plot
plt.figure(figsize=(12, 8))  # Adjust size as needed
plt.barh(top_20_importance_df['Feature'], top_20_importance_df['Importance'], color='dodgerblue')
plt.xlabel('Mean Absolute SHAP Value')
plt.ylabel('Feature')
plt.title('Top 20 Features Ranked by Mean Absolute SHAP Value')
plt.gca().invert_yaxis()  # Most important feature on top
plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
import numpy as np

# Suppose your X_preprocessed is shape (n_samples, n_features).
# Randomly sample e.g. 300 rows:
sample_size = 300
if X_preprocessed.shape[0] > sample_size:
    rnd_idx = np.random.choice(X_preprocessed.shape[0], sample_size, replace=False)
    X_sampled = X_preprocessed.iloc[rnd_idx]

else:
    X_sampled = X_preprocessed

# Now compute interaction values on this smaller subset
shap_interaction_values = explainer.shap_interaction_values(X_sampled)
In [ ]:
import numpy as np
import pandas as pd

def aggregate_shap_interactions(shap_interaction_values, feature_names, get_base_feature):
    """
    Aggregates pairwise SHAP interaction values back to their original (pre–one-hot) features.
    
    Parameters
    ----------
    shap_interaction_values : np.ndarray
        SHAP interaction values of shape [n_samples, n_features, n_features].
    feature_names : list of str
        The one-hot-encoded feature names corresponding to shap_interaction_values.
    get_base_feature : callable
        A function that takes a one-hot-encoded feature name and returns the base/original feature name.
    
    Returns
    -------
    pd.DataFrame
        DataFrame with ["Feature1", "Feature2", "InteractionValue", "AbsInteraction"] 
        sorted in descending order of AbsInteraction.
    """
    # 1. Aggregate across samples (e.g., mean absolute interactions)
    interaction_matrix = np.mean(np.abs(shap_interaction_values), axis=0)
    
    # 2. Map each OHE feature to a base feature
    base_feature_names = [get_base_feature(n) for n in feature_names]
    unique_base_features = list(set(base_feature_names))
    
    # 3. Build a structure to accumulate aggregated pairwise interactions
    aggregated_interactions = {
        bf_i: {bf_j: 0.0 for bf_j in unique_base_features}
        for bf_i in unique_base_features
    }

    n_features = len(feature_names)
    for i in range(n_features):
        for j in range(i+1, n_features): # i+1 => no diagonal, no duplicates
            bf_i = base_feature_names[i]
            bf_j = base_feature_names[j]
            aggregated_interactions[bf_i][bf_j] += interaction_matrix[i, j]
    
    # 4. Convert to DataFrame
    data_records = []
    for bf_i in unique_base_features:
        for bf_j in unique_base_features:
            # If you want to keep only i <= j, add a condition to avoid duplicates
            interaction_val = aggregated_interactions[bf_i][bf_j]
            data_records.append((bf_i, bf_j, interaction_val))
    
    df_interactions = pd.DataFrame(data_records, columns=["Feature1", "Feature2", "InteractionValue"])
    df_interactions["AbsInteraction"] = df_interactions["InteractionValue"].abs()
    
    # Sort descending by absolute interaction
    df_interactions.sort_values("AbsInteraction", ascending=False, inplace=True)
    df_interactions.reset_index(drop=True, inplace=True)
    df_interactions_no_diagonal = df_interactions[df_interactions['Feature1'] != df_interactions['Feature2']]
    return df_interactions_no_diagonal


# Example usage:
def simple_get_base_feature(name):
    # Remove the cat__ prefix if present
    if name.startswith("cat__"):
        name = name[len("cat__"):]
    # Then split on the first underscore only
    return name.split("_", 1)[0]

df_agg_interactions = aggregate_shap_interactions(
    shap_interaction_values=shap_interaction_values,
    feature_names=feature_names,
    get_base_feature=simple_get_base_feature
)

# Print the top 20 interactions
print(df_agg_interactions.head(20))

# --- Pick Top 20 Interactions ---
df_top_20 = df_agg_interactions.head(20).copy()

# Create a convenient label for each pair
df_top_20["Pair"] = df_top_20["Feature1"] + " & " + df_top_20["Feature2"]

# --- Plot ---
plt.figure(figsize=(10, 6))
sns.barplot(
    data=df_top_20, 
    y="Pair", 
    x="AbsInteraction", 
    color="royalblue"
)
plt.title("Top 20 Pairwise Feature Interactions by Absolute SHAP Value")
plt.xlabel("Absolute SHAP Interaction Value")
plt.ylabel("Feature Pair")
plt.tight_layout()
plt.show()
   Feature1 Feature2  InteractionValue  AbsInteraction
1      wave    V2116          0.512344        0.512344
2      wave    V2105          0.453857        0.453857
3      wave    V2101          0.280974        0.280974
4      wave     race          0.173988        0.173988
5      wave    V2196          0.081804        0.081804
6      wave    V2108          0.077313        0.077313
7     V2166     wave          0.053517        0.053517
8     V2194     wave          0.050455        0.050455
9      wave      sex          0.049956        0.049956
10     wave      V13          0.041606        0.041606
11    V2116    V2105          0.038339        0.038339
12    V2116    V2101          0.035215        0.035215
13     wave    V2191          0.033705        0.033705
14     wave    V2152          0.033623        0.033623
15     wave    V2179          0.033035        0.033035
17    V2201     wave          0.031867        0.031867
18    V2116     race          0.030408        0.030408
19     wave    V2164          0.026907        0.026907
20     wave    V2195          0.026834        0.026834
21    V2173     wave          0.025816        0.025816
No description has been provided for this image

XGBOOST¶

In [ ]:
import os
import logging
import joblib
import xgboost as xgb

from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import FunctionTransformer
import numpy as np

# ======================
# 1. Expanded Hyperparameter Grid
# ======================
param_dist = {
    # Increase n_estimators up to 1000 (or more)
    'classifier__n_estimators': [100, 300, 500, 800, 1000],
    
    # Smaller learning rates for finer updates
    'classifier__learning_rate': [0.01, 0.02, 0.03, 0.05, 0.1],
    
    # Broader range for max_depth
    'classifier__max_depth': [3, 5, 7, 9, 12],
    
    # Tweak min_child_weight to control complexity
    'classifier__min_child_weight': [1, 3, 5, 7, 10],
    
    # Keep or enlarge subsample
    'classifier__subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    
    # Potentially try colsample_bytree > 1.0 (uncommon, but possible)
    'classifier__colsample_bytree': [0.6, 0.8, 1.0, 1.2],
    
    # Adjust gamma (larger => more conservative splits)
    'classifier__gamma': [0, 0.1, 0.3, 0.6, 1.0],
    
    # Expand regularization range
    'classifier__reg_alpha': [0, 0.1, 0.2, 0.5, 1.0, 2.0],
    'classifier__reg_lambda': [0.5, 1.0, 1.5, 2.0, 3.0, 4.0]
}

# ============================
# 2. OPTIONAL FEATURE ENGINEERING
# ============================
# Example: add a custom transformer to create domain-specific or interaction features
# (Here, we just pass data through, but you'd modify 'feature_engineering' to transform the input DataFrame.)

def feature_engineering(X):
    """
    Placeholder function where you can create domain-specific,
    polynomial, or ratio features. This must return a DataFrame or array.
    """
    # Example: create a simple ratio of two columns (if they exist)
    # if 'colA' in X and 'colB' in X:
    #     X['ratio_A_B'] = X['colA'] / (X['colB'] + 1e-9)
    
    return X

feature_eng_transformer = FunctionTransformer(feature_engineering, validate=False)

# ============================
# 3. Build Pipeline
# ============================
# We assume:
#   - You already have 'preprocessor' for OneHotEncoder, etc.
#   - 'X_train_with_indicators', 'y_train', 'X_test_with_indicators', 'y_test'
#   - 'train_evaluate_model(...)' function
#   - Constants: RANDOM_STATE, SCORING_METRIC, N_SPLITS_CV

xgb_clf = xgb.XGBClassifier(
    eval_metric='logloss',
    random_state=RANDOM_STATE
)

# Here we insert a feature engineering step *before* the preprocessor:
xgb_pipeline = Pipeline([
    ('feature_engineering', feature_eng_transformer),
    ('preprocessor', preprocessor),
    ('classifier', xgb_clf)
])

# =========================
# 4. (Optional) Early Stopping
# =========================
# In scikit-learn's RandomizedSearchCV, providing early_stopping_rounds is non-trivial because
# you need a separate validation set or a custom approach inside each CV fold.
# If you wish to do a simple holdout for early stopping, you'd do something like:
#
# fit_params = {
#    'classifier__early_stopping_rounds': 30,
#    'classifier__eval_metric': 'logloss',
#    'classifier__eval_set': [(X_val, y_val)],  # separate validation set
# }
#
# Then pass fit_params=fit_params to random_search.fit(...).
# This is more advanced; we’ll skip it here for brevity.

# ============================
# 5. Randomized Search Setup
# ============================
cv = StratifiedKFold(n_splits=N_SPLITS_CV, shuffle=True, random_state=RANDOM_STATE)

random_search = RandomizedSearchCV(
    estimator=xgb_pipeline,
    param_distributions=param_dist,
    n_iter=50,  # Increase if you have time/resources
    cv=cv,
    scoring=SCORING_METRIC,
    n_jobs=-1,
    verbose=1,
    random_state=RANDOM_STATE
)

logging.info("Starting RandomizedSearchCV for expanded XGBoost grid...")
random_search.fit(X_train_with_indicators, y_train)
logging.info("RandomizedSearchCV complete.")

best_xgb_model = random_search.best_estimator_
logging.info(f"Best parameters: {random_search.best_params_}")
logging.info(f"Best CV {SCORING_METRIC}: {random_search.best_score_:.4f}")

# ============================
# 6. Evaluate & Save Best Model
# ============================
trained_best_xgb = train_evaluate_model(
    model=best_xgb_model,
    X_train=X_train_with_indicators,
    y_train=y_train,
    X_test=X_test_with_indicators,
    y_test=y_test,
    model_name="Tuned XGBoost Model"
)

model_filename = os.path.expanduser('~/work/vaping_project_data/best_xgb_model.joblib')
joblib.dump(trained_best_xgb, model_filename)
logging.info(f"Final XGBoost model saved to: {model_filename}")
2025-02-15 10:33:01,124 - INFO - Starting RandomizedSearchCV for expanded XGBoost grid...
Fitting 5 folds for each of 50 candidates, totalling 250 fits
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/model_selection/_validation.py:528: FitFailedWarning: 
75 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
75 fits failed with the following error:
Traceback (most recent call last):
  File "/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/pipeline.py", line 662, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/xgboost/core.py", line 726, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/xgboost/sklearn.py", line 1599, in fit
    self._Booster = train(
                    ^^^^^^
  File "/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/xgboost/core.py", line 726, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/xgboost/training.py", line 181, in train
    bst.update(dtrain, iteration=i, fobj=obj)
  File "/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/xgboost/core.py", line 2100, in update
    _check_call(
  File "/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/xgboost/core.py", line 284, in _check_call
    raise XGBoostError(py_str(_LIB.XGBGetLastError()))
xgboost.core.XGBoostError: value 1.2 for Parameter colsample_bytree exceed bound [0,1]
colsample_bytree: Subsample ratio of columns, resample on each tree construction.

  warnings.warn(some_fits_failed_message, FitFailedWarning)
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/model_selection/_search.py:1108: UserWarning: One or more of the test scores are non-finite: [       nan        nan        nan 0.90200238 0.90706627        nan
 0.89999718        nan 0.89720182 0.85991863 0.87354831        nan
 0.896498   0.89432499 0.91069245        nan 0.90742226        nan
 0.86254671 0.89445416 0.82215935 0.83872734 0.89939779 0.9083536
 0.89200625 0.90937551 0.90646569        nan 0.89624894 0.90608667
 0.90722221 0.90915747 0.89949231 0.90255506 0.87744443        nan
 0.90801846 0.88299783 0.84824171 0.89819916        nan 0.88255418
 0.90983895 0.90093799        nan 0.90967376        nan        nan
        nan 0.90768914]
  warnings.warn(
2025-02-15 10:35:58,854 - INFO - RandomizedSearchCV complete.
2025-02-15 10:35:58,856 - INFO - Best parameters: {'classifier__subsample': 0.6, 'classifier__reg_lambda': 4.0, 'classifier__reg_alpha': 0.5, 'classifier__n_estimators': 800, 'classifier__min_child_weight': 5, 'classifier__max_depth': 12, 'classifier__learning_rate': 0.01, 'classifier__gamma': 0.3, 'classifier__colsample_bytree': 1.0}
2025-02-15 10:35:58,857 - INFO - Best CV roc_auc: 0.9107
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
2025-02-15 10:38:07,097 - INFO - === Tuned XGBoost Model Evaluation ===
2025-02-15 10:38:07,103 - INFO - Confusion Matrix:
[[2384  545]
 [ 598 3019]]
2025-02-15 10:38:07,116 - INFO - 
Classification Report:
              precision    recall  f1-score   support

         0.0       0.80      0.81      0.81      2929
         1.0       0.85      0.83      0.84      3617

    accuracy                           0.83      6546
   macro avg       0.82      0.82      0.82      6546
weighted avg       0.83      0.83      0.83      6546

2025-02-15 10:38:07,120 - INFO - ROC AUC: 0.9101
No description has been provided for this image
2025-02-15 10:38:07,350 - INFO - Final XGBoost model saved to: /storage/home/szn5432/work/vaping_project_data/best_xgb_model.joblib
In [ ]:
# Load the model (when needed)
def feature_engineering(X):
    """
    Placeholder function where you can create domain-specific,
    polynomial, or ratio features. This must return a DataFrame or array.
    """
    # Example: create a simple ratio of two columns (if they exist)
    # if 'colA' in X and 'colB' in X:
    #     X['ratio_A_B'] = X['colA'] / (X['colB'] + 1e-9)
    
    return X

file_path = os.path.expanduser('~/work/vaping_project_data/best_xgb_model.joblib')
loaded_xgb = joblib.load(file_path)
print("Model loaded successfully.")
Model loaded successfully.
In [ ]:
# Function to plot feature importances
def plot_feature_importance(loaded_xgb, feature_names, top_n=20, title="Feature Importance"):
    """
    Plots the top N feature importances from a trained model.
    """
    if hasattr(loaded_xgb, 'feature_importances_'):
        importances = loaded_xgb.feature_importances_
    elif hasattr(loaded_xgb, 'named_steps') and 'classifier' in loaded_xgb.named_steps:
        if hasattr(loaded_xgb.named_steps['classifier'], 'feature_importances_'):
            importances = loaded_xgb.named_steps['classifier'].feature_importances_
        else:
            raise ValueError("Classifier does not have feature_importances_ attribute.")
    else:
        raise ValueError("Provided model does not have feature_importances_ attribute.")

    fi_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    fi_df = fi_df.sort_values('Importance', ascending=False).head(top_n)

    plt.figure(figsize=(10, 6))
    sns.barplot(x='Importance', y='Feature', data=fi_df)
    plt.title(title)
    plt.tight_layout()
    plt.show()

# Get the feature names from the preprocessor
feature_names = loaded_xgb.named_steps['preprocessor'].get_feature_names_out()

# Plot the top 20 most important features
plot_feature_importance(loaded_xgb, feature_names, top_n=20, title="Top 20 Most Important Features")
No description has been provided for this image
In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

def plot_aggregated_feature_importance(model, top_n=20, title="Top 20 Aggregated Feature Importance"):
    """
    Plots the top N aggregated feature importances for a CatBoost model
    that is wrapped inside a Pipeline with a ColumnTransformer.
    
    Parameters
    ----------
    model : Pipeline
        A scikit-learn Pipeline that includes:
          - 'preprocessor': a ColumnTransformer or other transformer
          - 'classifier': a CatBoostClassifier
    top_n : int, optional (default=20)
        How many top aggregated features to display.
    title : str, optional
        Title of the plot.
    """
    # 1. Get the feature names after the preprocessor step
    feature_names = loaded_xgb.named_steps['preprocessor'].get_feature_names_out()
    
    # 2. Get the feature importances from CatBoost
    xgboost_estimator = loaded_xgb.named_steps['classifier']
    if not hasattr(xgboost_estimator, 'feature_importances_'):
        raise AttributeError("The CatBoost classifier does not expose 'feature_importances_'.")

    importances = xgboost_estimator.feature_importances_

    # 3. Aggregate importances by original feature
    aggregated_importance = {}
    
    for name, imp in zip(feature_names, importances):
        # Example naming conventions after ColumnTransformer + OneHotEncoder:
        #   "onehotencoder__Gender_Male"
        #   "remainder__Age"
        # Adjust this parsing logic as necessary for your pipeline.
        
        if "__" in name:
            # Split on the double underscore to separate the transformer name vs. the actual column
            parts = name.split("__", maxsplit=1)
            # parts[0] might be 'onehotencoder' or 'remainder'
            # parts[1] might be 'Gender_Male' or 'Age'
            # We'll then split again on '_' if needed to get just the original column name
            col_part = parts[1]
            
            # If the column was numeric (remainder), it may be simply 'Age'.
            # If the column was OHE, it might be 'Gender_Male' or 'Gender_Female'.
            # A simple approach is to take everything before the first underscore as the feature name:
            if "_" in col_part:
                original_feature = col_part.split("_", maxsplit=1)[0]
            else:
                original_feature = col_part
        else:
            # If there's no double underscore, assume the whole name is the feature
            original_feature = name
        
        # Sum up the importances
        aggregated_importance[original_feature] = aggregated_importance.get(original_feature, 0.0) + imp

    # 4. Make a DataFrame of aggregated importances and sort
    agg_df = pd.DataFrame(list(aggregated_importance.items()), columns=["Feature", "Importance"])
    agg_df = agg_df.sort_values("Importance", ascending=False)

    # 5. Plot top N aggregated feature importances
    plt.figure(figsize=(10, 6))
    sns.barplot(x="Importance", y="Feature", data=agg_df.head(top_n))
    plt.title(title)
    plt.tight_layout()
    plt.show()


# --- Usage Example ---
# Assuming you have your best_model pipeline (with 'preprocessor' and CatBoost 'classifier'):

plot_aggregated_feature_importance(loaded_xgb, top_n=20, title="Top 20 Aggregated Feature Importance")
No description has been provided for this image
In [ ]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.inspection import PartialDependenceDisplay


def get_top_aggregated_features(loaded_xgb, top_n=10):
    """
    Returns a list of top_n original (aggregated) feature names
    based on the CatBoost feature importances in a Pipeline.
    
    Parameters
    ----------
    model : Pipeline
        A scikit-learn Pipeline with steps:
          - "preprocessor" : ColumnTransformer (or similar)
          - "classifier" : CatBoostClassifier
    top_n : int
        Number of top features to return
        
    Returns
    -------
    list of str
        Top N aggregated feature names
    """
    # Extract feature names that come out of the preprocessor
    feature_names = loaded_xgb.named_steps['preprocessor'].get_feature_names_out()
    
    # Extract importances from CatBoost
    xgboost_estimator = loaded_xgb.named_steps['classifier']
    importances = xgboost_estimator.feature_importances_

    # Aggregate importances by the original (pre-encoding) feature name
    aggregated_importance = {}
    for name, imp in zip(feature_names, importances):
        if "__" in name:
            # Example: "onehotencoder__Gender_Male" -> original_feature = "Gender"
            parts = name.split("__", maxsplit=1)
            col_part = parts[1]
            if "_" in col_part: 
                # For OHE columns like "Gender_Male"
                original_feature = col_part.split("_", maxsplit=1)[0]
            else:
                # For remainder numeric columns
                original_feature = col_part
        else:
            # If no __, assume name is the feature
            original_feature = name
        
        aggregated_importance[original_feature] = (
            aggregated_importance.get(original_feature, 0.0) + imp
        )

    # Turn into a DataFrame, sort, and get top_n
    agg_df = pd.DataFrame(
        list(aggregated_importance.items()), 
        columns=["Feature", "Importance"]
    ).sort_values("Importance", ascending=False)

    return agg_df.head(top_n)["Feature"].tolist()

# 1. Get the top 10 features by aggregated importance
top_features = get_top_aggregated_features(loaded_xgb, top_n=10)
print("Top 10 aggregated features:\n", top_features)

# Plot the partial dependence plot
for feat in top_features:
    if feat not in X_train_with_indicators.columns:
        print(f"Skipping feature '{feat}' as it is not found in the DataFrame.")
        continue

    fig, ax = plt.subplots(figsize=(6, 4))
    PartialDependenceDisplay.from_estimator(
        estimator=loaded_xgb,
        X=X_train_with_indicators,
        features=[feat],
        kind='average',
        grid_resolution=50,
        target=1,  # positive class for binary classification
        ax=ax
    )
    plt.title(f"Partial Dependence of {feat}")
    plt.show()
Top 10 aggregated features:
 ['wave', 'V2116', 'V2101', 'V2105', 'V2108', 'V2193', 'V2152', 'V2191', 'V2176', 'V2179']
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [ ]:
# Add this import at the top of your script
from scipy import sparse

# --- SHAP Feature Importance ---
# Suppose your ColumnTransformer has the name "cat" for the OneHotEncoder step
# and you pass in categorical_features as the input_features:
encoded_feature_names = (
    preprocessor
    .named_transformers_['cat']  # "cat" is the name of the OHE step in ColumnTransformer
    .get_feature_names_out(input_features=categorical_features)
)


# Extract components from the pipeline
preprocessor = loaded_xgb.named_steps['preprocessor']
classifier = loaded_xgb.named_steps['classifier']

# Process the data through the pipeline
X_processed = preprocessor.transform(X_train_with_indicators)

# Convert sparse matrix to dense if needed
if isinstance(X_processed, (sparse.csr_matrix, sparse.csc_matrix)):
    X_processed = X_processed.toarray()

# Create a SHAP explainer
explainer = shap.TreeExplainer(classifier)

# Calculate SHAP values (using a sample for faster computation)
sample_idx = np.random.choice(X_processed.shape[0], 100, replace=False)
shap_values = explainer.shap_values(X_processed[sample_idx])

# Get feature names from the preprocessor
feature_names = encoded_feature_names  # From your existing code
In [ ]:
# Summary plot (feature importance)
plt.figure(figsize=(10, 8))
shap.summary_plot(shap_values, 
                 X_processed[sample_idx], 
                 feature_names=feature_names,
                 plot_type="bar",
                 show=False)
plt.title("SHAP Feature Importance (Mean Absolute Impact)")
plt.tight_layout()
plt.show()

# Detailed summary plot
plt.figure(figsize=(10, 8))
shap.summary_plot(shap_values, 
                 X_processed[sample_idx], 
                 feature_names=feature_names,
                 show=False)
plt.title("SHAP Value Distribution")
plt.tight_layout()
plt.show()
No description has been provided for this image
No description has been provided for this image
In [ ]:
# Force plot for average prediction
plt.figure()
shap.force_plot(explainer.expected_value, 
                shap_values[0], 
                X_processed[0], 
                feature_names=feature_names,
                matplotlib=True,
                show=False)
plt.title("SHAP Force Plot for First Sample")
plt.tight_layout()
plt.show()
<Figure size 640x480 with 0 Axes>
No description has been provided for this image
In [ ]:
# Extract original categorical features from one-hot encoded column names
import re

# Get the list of one-hot encoded column names
one_hot_encoded_columns = feature_names.tolist()

# Extract original categorical features by splitting at the first underscore or dot
original_categorical_features = list(set([re.split(r'[_.]', col)[0] for col in one_hot_encoded_columns]))

print("Original Categorical Features:")
print(original_categorical_features)
Original Categorical Features:
['V2188', 'V2163', 'V2116', 'V2134', 'V49', 'sex', 'V2105', 'V2175', 'V2460', 'V2128', 'V2108', 'V2195', 'missing', 'V2181', 'V2183', 'V2140', 'V2143', 'V2193', 'V2197', 'V2153', 'V2101', 'V2186', 'V2166', 'V2178', 'V2157', 'V2156', 'wave', 'V2137', 'V2196', 'V2187', 'V2182', 'V2176', 'V2180', 'V2185', 'V2125', 'V2171', 'RESPONDENT', 'V13', 'V2201', 'V2189', 'V2152', 'V2177', 'V2164', 'race', 'V2155', 'V2179', 'V2194', 'V2184', 'V2191', 'V2173', 'V2172']
In [ ]:
# List of original categorical features
original_categorical_features = [
    'V2137', 'V2172', 'V2181', 'V2178', 'V2134', 'V2163', 'V2197', 'V2188', 'V2191', 'V2155', 
    'V2128', 'V2105', 'V2175', 'V2185', 'V2153', 'V2194', 'V2183', 'V2143', 'V2184', 'V2460', 
    'race', 'V2907', 'V2494', 'RESPONDENT', 'V2164', 'V2146', 'V49', 'V2182', 'V13', 'V2152', 
    'V2176', 'V2196', 'V2187', 'V2173', 'V2108', 'V2033', 'V2177', 'V2030', 'V2171', 'V2119', 
    'V2908', 'V2195', 'V2116', 'V2180', 'V2186', 'V2166', 'V2140', 'V2156', 'V2189', 'V2201', 
    'V2169', 'V2122', 'missing', 'sex', 'V2125', 'V2179', 'V2193', 'V2101', 'wave', 'V2157'
]

# Compute mean absolute SHAP values
mean_abs_shap_values = np.abs(shap_values).mean(axis=0)

# Create a dictionary to map one-hot encoded features to their original features
feature_mapping = {}
for feature in original_categorical_features:
    feature_mapping[feature] = [col for col in feature_names if col.startswith(feature)]

# Aggregate SHAP values for each original feature
aggregated_shap_values = {}
for feature, cols in feature_mapping.items():
    # Find the indices of the one-hot encoded columns for this feature
    indices = [feature_names.tolist().index(col) for col in cols]
    # Sum the mean absolute SHAP values for these columns
    aggregated_shap_values[feature] = np.sum(mean_abs_shap_values[indices])

# Convert the aggregated SHAP values to a DataFrame
aggregated_shap_df = pd.DataFrame({
    'Feature': list(aggregated_shap_values.keys()),
    'Aggregated_SHAP': list(aggregated_shap_values.values())
})

# Sort by aggregated SHAP values in descending order
aggregated_shap_df = aggregated_shap_df.sort_values(by='Aggregated_SHAP', ascending=False)

# Display the top 20 aggregated features
top_n = 20  # Set to 20 for top 20 features
print("Top 20 Aggregated SHAP Features:")
print(aggregated_shap_df.head(top_n))

# Plot the top 20 aggregated features
plt.figure(figsize=(12, 8))
sns.barplot(x='Aggregated_SHAP', y='Feature', data=aggregated_shap_df.head(top_n), palette='viridis')
plt.title(f'Top {top_n} Aggregated SHAP Features')
plt.xlabel('Aggregated SHAP Value')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()
Top 20 Aggregated SHAP Features:
   Feature  Aggregated_SHAP
58    wave         0.840751
57   V2101         0.704158
42   V2116         0.459052
11   V2105         0.208962
24   V2164         0.107361
10   V2128         0.087983
55   V2179         0.085944
45   V2166         0.083725
29   V2152         0.074671
56   V2193         0.068521
12   V2175         0.061661
34   V2108         0.056134
30   V2176         0.055857
28     V13         0.054423
18   V2184         0.054002
17   V2143         0.049636
20    race         0.049562
41   V2195         0.048198
33   V2173         0.047251
36   V2177         0.045113
/tmp/ipykernel_1702033/2393572530.py:43: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='Aggregated_SHAP', y='Feature', data=aggregated_shap_df.head(top_n), palette='viridis')
No description has been provided for this image
In [ ]:
# Ensure SHAP is installed
import shap
shap.initjs()  # For visualization in notebooks

# Compute SHAP interaction values (using a sample for faster computation)
sample_idx = np.random.choice(X_processed.shape[0], 100, replace=False)  # Use a sample of 100 instances
shap_interaction_values = explainer.shap_interaction_values(X_processed[sample_idx])

# Get feature names from the preprocessor
feature_names = encoded_feature_names  # From your existing code
No description has been provided for this image
In [ ]:
# Step 1: Store interactions and their values, avoiding duplicates and self-interactions
interaction_results = []

for i, feature_i in enumerate(original_categorical_features):
    for j, feature_j in enumerate(original_categorical_features):
        # Skip self-interactions
        if feature_i == feature_j:
            continue

        # Ensure unique pairs by sorting feature names
        feature_pair = tuple(sorted([feature_i, feature_j]))

        # Skip if the pair is already processed
        if feature_pair in [result[0] for result in interaction_results]:
            continue

        # Get indices for feature_i and feature_j
        indices_i = [feature_names.tolist().index(col) for col in feature_mapping[feature_i] if col in feature_names.tolist()]
        indices_j = [feature_names.tolist().index(col) for col in feature_mapping[feature_j] if col in feature_names.tolist()]

        if not indices_i or not indices_j:
            continue

        # Compute interaction value
        value = np.sum(np.abs(shap_interaction_values[:, indices_i, :][:, :, indices_j]))
        interaction_results.append((feature_pair, value))

# Step 2: Sort the interactions by their values
sorted_interactions = sorted(interaction_results, key=lambda x: x[1], reverse=True)

# Step 3: Convert results to a DataFrame for easy aggregation
interaction_df = pd.DataFrame(sorted_interactions, columns=["Feature Pair", "Interaction Value"])
interaction_df[["Feature A", "Feature B"]] = pd.DataFrame(interaction_df["Feature Pair"].tolist(), index=interaction_df.index)
interaction_df = interaction_df.drop(columns=["Feature Pair"])

# Step 4: Select the top 30 interactions
top_30_interactions = interaction_df.sort_values(by="Interaction Value", ascending=False).head(30)

# Step 5: Display the results
print("Top 30 Feature Interactions (Unique Pairs):")
print(top_30_interactions)

# Step 6: Visualize the top 30 interactions
import matplotlib.pyplot as plt

# Create a bar plot
plt.figure(figsize=(12, 8))
plt.barh(
    top_30_interactions.apply(lambda row: f"{row['Feature A']} & {row['Feature B']}", axis=1),
    top_30_interactions["Interaction Value"],
    color="skyblue",
)
plt.xlabel("Interaction Value")
plt.ylabel("Feature Pairs")
plt.title("Top 30 Feature Interactions (Unique Pairs)")
plt.gca().invert_yaxis()  # Invert y-axis for better readability
plt.tight_layout()
plt.show()
Top 30 Feature Interactions (Unique Pairs):
    Interaction Value Feature A Feature B
0           94.461174     V2116      wave
1           68.112961     V2105      wave
2           57.410591     V2101      wave
3           29.019850     V2101     V2116
4           24.331301     V2105     V2116
5           21.626293     V2108      wave
6           14.987869     V2101     V2105
7           12.275227     V2108     V2116
8           11.319855     V2164      wave
9           11.317589     V2166      wave
10          11.201500      race      wave
11          11.175924     V2196      wave
12          10.676391     V2163      wave
13          10.323991     V2194      wave
14           9.517755     V2179      wave
15           9.090555     V2152      wave
16           8.913261     V2191      wave
17           8.352642     V2176      wave
18           7.616253     V2101     V2108
19           7.319203     V2116     V2166
20           7.314087       V13      wave
21           6.935249     V2173      wave
22           5.815115     V2197      wave
23           5.635795     V2101     V2166
24           5.446934     V2193      wave
25           5.402956     V2195      wave
26           5.175810       V13     V2101
27           5.164973     V2175      wave
28           5.072132     V2172      wave
29           4.982261     V2116     V2152
No description has been provided for this image

CatBoost¶

In [ ]:
import os
import logging
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import MissingIndicator
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingClassifier
from catboost import CatBoostClassifier
from scipy.stats import uniform, randint

# Create a pipeline with the preprocessor and CatBoost classifier
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', CatBoostClassifier(
        iterations= 500,  # Increase the number of iterations if needed
        learning_rate=0.1,
        depth=6,
        loss_function='Logloss',
        verbose=0,
        random_seed=RANDOM_STATE
    ))
])

# Define the parameter distribution for RandomizedSearchCV
param_dist = {
    'classifier__iterations': randint(1000, 5000),  # Wider range for iterations
    'classifier__learning_rate': uniform(0.01, 0.3),  # Wider range for learning rate
    'classifier__depth': randint(4, 12),  # Wider range for depth
    'classifier__l2_leaf_reg': uniform(1e-2, 10),  # L2 regularization
    'classifier__border_count': randint(32, 255),  # Border count
    'classifier__bagging_temperature': uniform(0, 1),  # Bagging temperature
    'classifier__random_strength': uniform(1e-9, 10),  # Random strength
    'classifier__od_type': ['IncToDec', 'Iter'],  # Overfitting detector type
    'classifier__od_wait': randint(10, 50)  # Overfitting detector wait
}

# Perform RandomizedSearchCV for hyperparameter tuning
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=20,  # Increase the number of parameter settings sampled if needed
    cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE),
    scoring=SCORING_METRIC,
    n_jobs=CPU_COUNT,
    verbose=VERBOSE,
    random_state=RANDOM_STATE
)

random_search.fit(X_train_with_indicators, y_train)

# Log the best parameters and score
logging.info("Best parameters found: " + str(random_search.best_params_))
logging.info(f"Best cross-validation {SCORING_METRIC}: {random_search.best_score_:.4f}")

# Evaluate the best model
best_model = random_search.best_estimator_

# Transform the data using the fitted preprocessor in the best model
X_train_transformed = best_model.named_steps['preprocessor'].transform(X_train_with_indicators)
X_test_transformed = best_model.named_steps['preprocessor'].transform(X_test_with_indicators)

# Predict and evaluate
y_pred = best_model.predict(X_test_with_indicators)
y_pred_proba = best_model.predict_proba(X_test_with_indicators)[:, 1]

logging.info("=== Best CatBoost Model Evaluation ===")
logging.info("Confusion Matrix:\n" + str(confusion_matrix(y_test, y_pred)))
logging.info("\nClassification Report:\n" + str(classification_report(y_test, y_pred)))
logging.info(f"ROC AUC: {roc_auc_score(y_test, y_pred_proba):.4f}")

# Plot ROC Curve
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'AUC = {roc_auc_score(y_test, y_pred_proba):.4f}')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Best CatBoost Model ROC Curve on Test Data')
plt.legend(loc='lower right')
plt.show()

# Save the best model
model_save_path = 'best_catboost_model.pkl'
joblib.dump(best_model, model_save_path)
logging.info(f"Best CatBoost model saved to '{model_save_path}'.")
Fitting 3 folds for each of 20 candidates, totalling 60 fits
2025-02-15 09:46:30,631 - INFO - Best parameters found: {'classifier__bagging_temperature': 0.45924889196586716, 'classifier__border_count': 148, 'classifier__depth': 7, 'classifier__iterations': 3919, 'classifier__l2_leaf_reg': 7.090725777960454, 'classifier__learning_rate': 0.016175348288740735, 'classifier__od_type': 'Iter', 'classifier__od_wait': 33, 'classifier__random_strength': 8.324426409004218}
2025-02-15 09:46:30,633 - INFO - Best cross-validation roc_auc: 0.9065
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
2025-02-15 09:46:31,436 - INFO - === Best CatBoost Model Evaluation ===
2025-02-15 09:46:31,443 - INFO - Confusion Matrix:
[[2365  564]
 [ 580 3037]]
2025-02-15 09:46:31,456 - INFO - 
Classification Report:
              precision    recall  f1-score   support

         0.0       0.80      0.81      0.81      2929
         1.0       0.84      0.84      0.84      3617

    accuracy                           0.83      6546
   macro avg       0.82      0.82      0.82      6546
weighted avg       0.83      0.83      0.83      6546

2025-02-15 09:46:31,460 - INFO - ROC AUC: 0.9107
No description has been provided for this image
2025-02-15 09:46:31,706 - INFO - Best CatBoost model saved to 'best_catboost_model.pkl'.
In [ ]:
model_filename = os.path.expanduser('~/work/vaping_project_data/best_cb_model.joblib')
joblib.dump(best_model, model_filename)
logging.info(f"Final CatBoost model saved to: {model_filename}")
2025-02-15 09:49:03,429 - INFO - Final CatBoost model saved to: /storage/home/szn5432/work/vaping_project_data/best_cb_model.joblib
In [ ]:
# Load the model (when needed)
file_path = os.path.expanduser('~/work/vaping_project_data/best_cb_model.joblib')
loaded_cb = joblib.load(file_path)
print("Model loaded successfully.")
Model loaded successfully.
In [ ]:
# Function to plot feature importances
def plot_feature_importance(loaded_cb, feature_names, top_n=20, title="Feature Importance"):
    """
    Plots the top N feature importances from a trained model.
    """
    if hasattr(loaded_cb, 'feature_importances_'):
        importances = loaded_cb.feature_importances_
    elif hasattr(loaded_cb, 'named_steps') and 'classifier' in loaded_cb.named_steps:
        if hasattr(loaded_cb.named_steps['classifier'], 'feature_importances_'):
            importances = loaded_cb.named_steps['classifier'].feature_importances_
        else:
            raise ValueError("Classifier does not have feature_importances_ attribute.")
    else:
        raise ValueError("Provided model does not have feature_importances_ attribute.")

    fi_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    fi_df = fi_df.sort_values('Importance', ascending=False).head(top_n)

    plt.figure(figsize=(10, 6))
    sns.barplot(x='Importance', y='Feature', data=fi_df)
    plt.title(title)
    plt.tight_layout()
    plt.show()

# Get the feature names from the preprocessor
feature_names = best_model.named_steps['preprocessor'].get_feature_names_out()

# Plot the top 20 most important features
plot_feature_importance(best_model, feature_names, top_n=20, title="Top 20 Most Important Features")
No description has been provided for this image
In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

def plot_aggregated_feature_importance(model, top_n=20, title="Top 20 Aggregated Feature Importance"):
    """
    Plots the top N aggregated feature importances for a CatBoost model
    that is wrapped inside a Pipeline with a ColumnTransformer.
    
    Parameters
    ----------
    model : Pipeline
        A scikit-learn Pipeline that includes:
          - 'preprocessor': a ColumnTransformer or other transformer
          - 'classifier': a CatBoostClassifier
    top_n : int, optional (default=20)
        How many top aggregated features to display.
    title : str, optional
        Title of the plot.
    """
    # 1. Get the feature names after the preprocessor step
    feature_names = loaded_cb.named_steps['preprocessor'].get_feature_names_out()
    
    # 2. Get the feature importances from CatBoost
    catboost_estimator = loaded_cb.named_steps['classifier']
    if not hasattr(catboost_estimator, 'feature_importances_'):
        raise AttributeError("The CatBoost classifier does not expose 'feature_importances_'.")

    importances = catboost_estimator.feature_importances_

    # 3. Aggregate importances by original feature
    aggregated_importance = {}
    
    for name, imp in zip(feature_names, importances):
        # Example naming conventions after ColumnTransformer + OneHotEncoder:
        #   "onehotencoder__Gender_Male"
        #   "remainder__Age"
        # Adjust this parsing logic as necessary for your pipeline.
        
        if "__" in name:
            # Split on the double underscore to separate the transformer name vs. the actual column
            parts = name.split("__", maxsplit=1)
            # parts[0] might be 'onehotencoder' or 'remainder'
            # parts[1] might be 'Gender_Male' or 'Age'
            # We'll then split again on '_' if needed to get just the original column name
            col_part = parts[1]
            
            # If the column was numeric (remainder), it may be simply 'Age'.
            # If the column was OHE, it might be 'Gender_Male' or 'Gender_Female'.
            # A simple approach is to take everything before the first underscore as the feature name:
            if "_" in col_part:
                original_feature = col_part.split("_", maxsplit=1)[0]
            else:
                original_feature = col_part
        else:
            # If there's no double underscore, assume the whole name is the feature
            original_feature = name
        
        # Sum up the importances
        aggregated_importance[original_feature] = aggregated_importance.get(original_feature, 0.0) + imp

    # 4. Make a DataFrame of aggregated importances and sort
    agg_df = pd.DataFrame(list(aggregated_importance.items()), columns=["Feature", "Importance"])
    agg_df = agg_df.sort_values("Importance", ascending=False)

    # 5. Plot top N aggregated feature importances
    plt.figure(figsize=(10, 6))
    sns.barplot(x="Importance", y="Feature", data=agg_df.head(top_n))
    plt.title(title)
    plt.tight_layout()
    plt.show()


# --- Usage Example ---
# Assuming you have your best_model pipeline (with 'preprocessor' and CatBoost 'classifier'):

plot_aggregated_feature_importance(best_model, top_n=20, title="Top 20 Aggregated Feature Importance")
No description has been provided for this image
In [ ]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.inspection import PartialDependenceDisplay


def get_top_aggregated_features(loaded_cb, top_n=10):
    """
    Returns a list of top_n original (aggregated) feature names
    based on the CatBoost feature importances in a Pipeline.
    
    Parameters
    ----------
    model : Pipeline
        A scikit-learn Pipeline with steps:
          - "preprocessor" : ColumnTransformer (or similar)
          - "classifier" : CatBoostClassifier
    top_n : int
        Number of top features to return
        
    Returns
    -------
    list of str
        Top N aggregated feature names
    """
    # Extract feature names that come out of the preprocessor
    feature_names = loaded_cb.named_steps['preprocessor'].get_feature_names_out()
    
    # Extract importances from CatBoost
    catboost_estimator = loaded_cb.named_steps['classifier']
    importances = catboost_estimator.feature_importances_

    # Aggregate importances by the original (pre-encoding) feature name
    aggregated_importance = {}
    for name, imp in zip(feature_names, importances):
        if "__" in name:
            # Example: "onehotencoder__Gender_Male" -> original_feature = "Gender"
            parts = name.split("__", maxsplit=1)
            col_part = parts[1]
            if "_" in col_part: 
                # For OHE columns like "Gender_Male"
                original_feature = col_part.split("_", maxsplit=1)[0]
            else:
                # For remainder numeric columns
                original_feature = col_part
        else:
            # If no __, assume name is the feature
            original_feature = name
        
        aggregated_importance[original_feature] = (
            aggregated_importance.get(original_feature, 0.0) + imp
        )

    # Turn into a DataFrame, sort, and get top_n
    agg_df = pd.DataFrame(
        list(aggregated_importance.items()), 
        columns=["Feature", "Importance"]
    ).sort_values("Importance", ascending=False)

    return agg_df.head(top_n)["Feature"].tolist()

# 1. Get the top 10 features by aggregated importance
top_features = get_top_aggregated_features(loaded_cb, top_n=10)
print("Top 10 aggregated features:\n", top_features)

# Plot the partial dependence plot
for feat in top_features:
    if feat not in X_train_with_indicators.columns:
        print(f"Skipping feature '{feat}' as it is not found in the DataFrame.")
        continue

    fig, ax = plt.subplots(figsize=(6, 4))
    PartialDependenceDisplay.from_estimator(
        estimator=loaded_cb,
        X=X_train_with_indicators,
        features=[feat],
        kind='average',
        grid_resolution=50,
        target=1,  # positive class for binary classification
        ax=ax
    )
    plt.title(f"Partial Dependence of {feat}")
    plt.show()
Top 10 aggregated features:
 ['wave', 'V2116', 'V2105', 'V2101', 'race', 'V2196', 'V2152', 'V2191', 'V2179', 'V2108']
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [ ]:
import shap
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

X_train_transformed = loaded_cb.named_steps['preprocessor'].transform(X_train_with_indicators)


def plot_top_shap_aggregated_features(model, X_train_transformed, feature_names, top_n=20, title="Top 20 SHAP Aggregated Features"):
    """
    Plots the top N aggregated SHAP feature importances for a model.
    
    Parameters
    ----------
    model : Pipeline
        A scikit-learn Pipeline that includes:
          - 'preprocessor': a ColumnTransformer or other transformer
          - 'classifier': a CatBoostClassifier
    X_train_transformed : array-like
        Transformed training data (output of the preprocessor).
    feature_names : array-like
        Feature names after transformation.
    top_n : int, optional (default=20)
        How many top aggregated features to display.
    title : str, optional
        Title of the plot.
    """
    # 1. Extract the classifier from the pipeline
    catboost_estimator = loaded_cb.named_steps['classifier']
    
    # 2. Initialize SHAP explainer for the CatBoost model
    explainer = shap.TreeExplainer(catboost_estimator)
    
    # 3. Compute SHAP values for the transformed training data
    shap_values = explainer.shap_values(X_train_transformed)

    # For classification problems, shap_values is a list (one element per class).
    # We use the positive class for binary classification (index 1).
    if isinstance(shap_values, list):
        shap_values = shap_values[1]
    
    # 4. Aggregate SHAP values back to the original feature names
    aggregated_shap = {}
    for i, name in enumerate(feature_names):
        # Parse original feature name from encoded feature name
        if "__" in name:
            col_part = name.split("__", maxsplit=1)[1]  # Split after the double underscore
            if "_" in col_part:
                original_feature = col_part.split("_", maxsplit=1)[0]
            else:
                original_feature = col_part
        else:
            original_feature = name

        # Sum SHAP values for the same original feature
        aggregated_shap[original_feature] = aggregated_shap.get(original_feature, 0.0) + abs(shap_values[:, i]).mean()
    
    # 5. Create a DataFrame of aggregated SHAP values and sort by importance
    shap_df = pd.DataFrame(list(aggregated_shap.items()), columns=["Feature", "SHAP Importance"])
    shap_df = shap_df.sort_values("SHAP Importance", ascending=False)

    # 6. Plot the top N SHAP aggregated feature importances
    plt.figure(figsize=(10, 6))
    sns.barplot(x="SHAP Importance", y="Feature", data=shap_df.head(top_n))
    plt.title(title)
    plt.tight_layout()
    plt.show()


# --- Usage Example ---
# Assuming you have:
# - 'best_model': your fitted pipeline
# - 'X_train_transformed': the transformed training data
# - 'feature_names': the output of 'get_feature_names_out()' from the preprocessor

plot_top_shap_aggregated_features(
     model=best_model,
     X_train_transformed=X_train_transformed,
     feature_names=best_model.named_steps['preprocessor'].get_feature_names_out(),
     top_n=20,
     title="Top 20 SHAP Aggregated Features"
)
No description has been provided for this image
In [ ]:
###################################
# SHAP Interactions
###################################

import shap
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# --- Step 1: Extract the classifier from the pipeline ---
catboost_estimator = loaded_cb.named_steps['classifier']

# --- Step 2: Initialize SHAP explainer ---
explainer = shap.TreeExplainer(catboost_estimator)
In [ ]:
# Sample a subset of the training data
sample_size = 2  # Adjust based on available memory and dataset size
X_train_sample = X_train_with_indicators.sample(n=sample_size, random_state=42)

# Transform the sampled data
X_train_sample_transformed = loaded_cb.named_steps['preprocessor'].transform(X_train_sample)

# Compute SHAP interaction values for the sample
interaction_values = explainer.shap_interaction_values(X_train_sample_transformed)
In [ ]:
import numpy as np
import pandas as pd

# List of original categorical features
original_categorical_features = [
    'V2137', 'V2172', 'V2181', 'V2178', 'V2134', 'V2163', 'V2197', 'V2188', 'V2191', 'V2155', 
    'V2128', 'V2105', 'V2175', 'V2185', 'V2153', 'V2194', 'V2183', 'V2143', 'V2184', 'V2460', 
    'race', 'V2907', 'V2494', 'RESPONDENT', 'V2164', 'V2146', 'V49', 'V2182', 'V13', 'V2152', 
    'V2176', 'V2196', 'V2187', 'V2173', 'V2108', 'V2033', 'V2177', 'V2030', 'V2171', 'V2119', 
    'V2908', 'V2195', 'V2116', 'V2180', 'V2186', 'V2166', 'V2140', 'V2156', 'V2189', 'V2201', 
    'V2169', 'V2122', 'missing', 'sex', 'V2125', 'V2179', 'V2193', 'V2101', 'wave', 'V2157'
]

# Get the feature names from your fitted pipeline
feature_names = best_model.named_steps['preprocessor'].get_feature_names_out()

# Inspect them to see how they're named
print("Transformed feature names:\n", feature_names)

# A helper function to check if a transformed column belongs to a given original feature
def belongs_to_original(col_name: str, original_feat: str) -> bool:
    parts = col_name.split("__", maxsplit=1)
    if len(parts) == 2:
        encoded_part = parts[1]  # e.g. "V2137_0"
    else:
        encoded_part = parts[0]
    return (encoded_part == original_feat) or encoded_part.startswith(original_feat + "_")

# Create a dictionary to map each original categorical feature to its transformed columns
feature_mapping = {}
for feature in original_categorical_features:
    matched_cols = [
        col for col in feature_names 
        if belongs_to_original(col, feature)
    ]
    feature_mapping[feature] = matched_cols

# Now do your SHAP interaction aggregation
aggregated_interaction_matrix = np.zeros((len(original_categorical_features), len(original_categorical_features)))

for i, feature_i in enumerate(original_categorical_features):
    for j, feature_j in enumerate(original_categorical_features):
        indices_i = [feature_names.tolist().index(c) for c in feature_mapping[feature_i]]
        indices_j = [feature_names.tolist().index(c) for c in feature_mapping[feature_j]]
        
        # Sum absolute interaction values for these columns
        aggregated_interaction_matrix[i, j] = np.sum(np.abs(interaction_values[:, indices_i, :][:, :, indices_j]))

aggregated_interaction_df = pd.DataFrame(
    aggregated_interaction_matrix,
    index=original_categorical_features,
    columns=original_categorical_features
)

print("Aggregated Interaction DataFrame:\n", aggregated_interaction_df)
Transformed feature names:
 ['cat__V2178_2.0' 'cat__V2178_3.0' 'cat__V2178_4.0' 'cat__V2178_5.0'
 'cat__V2178_6.0' 'cat__V2178_nan' 'cat__V2188_1.0' 'cat__V2188_nan'
 'cat__V2197_1.0' 'cat__V2197_2.0' 'cat__V2197_3.0' 'cat__V2197_4.0'
 'cat__V2197_nan' 'cat__V2184_2.0' 'cat__V2184_3.0' 'cat__V2184_4.0'
 'cat__V2184_nan' 'cat__V2186_1.0' 'cat__V2186_nan' 'cat__V2171_2.0'
 'cat__V2171_6.0' 'cat__V2171_nan' 'cat__V2128_2.0' 'cat__V2128_3.0'
 'cat__V2128_4.0' 'cat__V2128_5.0' 'cat__V2128_6.0' 'cat__V2128_7.0'
 'cat__V2128_nan' 'cat__V2201_1.0' 'cat__V2201_2.0' 'cat__V2201_3.0'
 'cat__V2201_4.0' 'cat__V2201_nan' 'cat__V2173_2.0' 'cat__V2173_3.0'
 'cat__V2173_4.0' 'cat__V2173_5.0' 'cat__V2173_6.0' 'cat__V2173_7.0'
 'cat__V2173_nan' 'cat__V2194_2.0' 'cat__V2194_3.0' 'cat__V2194_4.0'
 'cat__V2194_5.0' 'cat__V2194_6.0' 'cat__V2194_nan' 'cat__V2166_2.0'
 'cat__V2166_3.0' 'cat__V2166_4.0' 'cat__V2166_5.0' 'cat__V2166_6.0'
 'cat__V2166_7.0' 'cat__V2166_8.0' 'cat__V2166_nan' 'cat__wave_2018'
 'cat__wave_2019' 'cat__wave_2020' 'cat__wave_2021' 'cat__wave_2022'
 'cat__wave_2023' 'cat__V2176_2.0' 'cat__V2176_3.0' 'cat__V2176_4.0'
 'cat__V2176_5.0' 'cat__V2176_6.0' 'cat__V2176_7.0' 'cat__V2176_nan'
 'cat__V2175_2.0' 'cat__V2175_3.0' 'cat__V2175_4.0' 'cat__V2175_5.0'
 'cat__V2175_6.0' 'cat__V2175_7.0' 'cat__V2175_nan' 'cat__V2177_2.0'
 'cat__V2177_3.0' 'cat__V2177_4.0' 'cat__V2177_5.0' 'cat__V2177_6.0'
 'cat__V2177_7.0' 'cat__V2177_nan' 'cat__V2116_2.0' 'cat__V2116_3.0'
 'cat__V2116_4.0' 'cat__V2116_5.0' 'cat__V2116_6.0' 'cat__V2116_7.0'
 'cat__V2116_nan' 'cat__V2125_2.0' 'cat__V2125_3.0' 'cat__V2125_4.0'
 'cat__V2125_5.0' 'cat__V2125_6.0' 'cat__V2125_7.0' 'cat__V2125_nan'
 'cat__V2182_2.0' 'cat__V2182_3.0' 'cat__V2182_4.0' 'cat__V2182_nan'
 'cat__sex_1.0' 'cat__race_2.0' 'cat__race_3.0' 'cat__V2460_2.0'
 'cat__V2460_3.0' 'cat__V2460_4.0' 'cat__V2460_5.0' 'cat__V2460_6.0'
 'cat__V2460_7.0' 'cat__V2460_nan' 'cat__RESPONDENT_AGE_2.0'
 'cat__RESPONDENT_AGE_nan' 'cat__V2185_1.0' 'cat__V2185_nan'
 'cat__V2193_2.0' 'cat__V2193_3.0' 'cat__V2193_4.0' 'cat__V2193_5.0'
 'cat__V2193_6.0' 'cat__V2193_7.0' 'cat__V2193_8.0' 'cat__V2193_9.0'
 'cat__V2193_10.0' 'cat__V2193_nan' 'cat__V2163_2.0' 'cat__V2163_3.0'
 'cat__V2163_4.0' 'cat__V2163_5.0' 'cat__V2163_6.0' 'cat__V2163_7.0'
 'cat__V2163_nan' 'cat__V49_1.0' 'cat__V49_2.0' 'cat__V49_3.0'
 'cat__V49_nan' 'cat__V2108_2.0' 'cat__V2108_3.0' 'cat__V2108_4.0'
 'cat__V2108_5.0' 'cat__V2108_6.0' 'cat__V2108_nan' 'cat__V2101_2.0'
 'cat__V2101_3.0' 'cat__V2101_4.0' 'cat__V2101_5.0' 'cat__V2101_nan'
 'cat__V2180_2.0' 'cat__V2180_3.0' 'cat__V2180_4.0' 'cat__V2180_nan'
 'cat__V2164_2.0' 'cat__V2164_3.0' 'cat__V2164_4.0' 'cat__V2164_5.0'
 'cat__V2164_6.0' 'cat__V2164_7.0' 'cat__V2164_nan' 'cat__V2191_2.0'
 'cat__V2191_3.0' 'cat__V2191_4.0' 'cat__V2191_5.0' 'cat__V2191_6.0'
 'cat__V2191_7.0' 'cat__V2191_8.0' 'cat__V2191_nan' 'cat__V2195_2.0'
 'cat__V2195_3.0' 'cat__V2195_4.0' 'cat__V2195_5.0' 'cat__V2195_6.0'
 'cat__V2195_nan' 'cat__V2155_1.0' 'cat__V2155_nan' 'cat__V2196_2.0'
 'cat__V2196_3.0' 'cat__V2196_4.0' 'cat__V2196_5.0' 'cat__V2196_6.0'
 'cat__V2196_nan' 'cat__V2189_1.0' 'cat__V2189_nan' 'cat__V2179_2.0'
 'cat__V2179_3.0' 'cat__V2179_4.0' 'cat__V2179_5.0' 'cat__V2179_6.0'
 'cat__V2179_7.0' 'cat__V2179_8.0' 'cat__V2179_9.0' 'cat__V2179_nan'
 'cat__V13_2' 'cat__V13_3' 'cat__V13_4' 'cat__V2143_2.0' 'cat__V2143_3.0'
 'cat__V2143_4.0' 'cat__V2143_5.0' 'cat__V2143_6.0' 'cat__V2143_7.0'
 'cat__V2143_nan' 'cat__V2134_2.0' 'cat__V2134_3.0' 'cat__V2134_4.0'
 'cat__V2134_5.0' 'cat__V2134_6.0' 'cat__V2134_7.0' 'cat__V2134_nan'
 'cat__V2172_2.0' 'cat__V2172_3.0' 'cat__V2172_4.0' 'cat__V2172_nan'
 'cat__V2137_2.0' 'cat__V2137_3.0' 'cat__V2137_4.0' 'cat__V2137_5.0'
 'cat__V2137_6.0' 'cat__V2137_7.0' 'cat__V2137_nan' 'cat__V2140_2.0'
 'cat__V2140_4.0' 'cat__V2140_5.0' 'cat__V2140_6.0' 'cat__V2140_7.0'
 'cat__V2140_nan' 'cat__V2105_2.0' 'cat__V2105_3.0' 'cat__V2105_4.0'
 'cat__V2105_5.0' 'cat__V2105_6.0' 'cat__V2105_7.0' 'cat__V2105_nan'
 'cat__V2157_1.0' 'cat__V2157_nan' 'cat__V2183_2.0' 'cat__V2183_3.0'
 'cat__V2183_4.0' 'cat__V2183_nan' 'cat__V2187_1.0' 'cat__V2187_nan'
 'cat__V2181_2.0' 'cat__V2181_3.0' 'cat__V2181_4.0' 'cat__V2181_nan'
 'cat__V2152_1.0' 'cat__V2152_2.0' 'cat__V2152_3.0' 'cat__V2152_4.0'
 'cat__V2152_5.0' 'cat__V2152_6.0' 'cat__V2152_7.0' 'cat__V2152_8.0'
 'cat__V2152_9.0' 'cat__V2152_nan' 'cat__V2153_2.0' 'cat__V2153_3.0'
 'cat__V2153_4.0' 'cat__V2153_nan' 'cat__V2156_1.0' 'cat__V2156_nan'
 'cat__missing_V2178_True' 'cat__missing_V2188_True'
 'cat__missing_V2197_True' 'cat__missing_V2184_True'
 'cat__missing_V2186_True' 'cat__missing_V2171_True'
 'cat__missing_V2128_True' 'cat__missing_V2201_True'
 'cat__missing_V2173_True' 'cat__missing_V2194_True'
 'cat__missing_V2166_True' 'cat__missing_V2176_True'
 'cat__missing_V2175_True' 'cat__missing_V2177_True'
 'cat__missing_V2116_True' 'cat__missing_V2125_True'
 'cat__missing_V2182_True' 'cat__missing_V2460_True'
 'cat__missing_RESPONDENT_AGE_True' 'cat__missing_V2185_True'
 'cat__missing_V2193_True' 'cat__missing_V2163_True'
 'cat__missing_V49_True' 'cat__missing_V2108_True'
 'cat__missing_V2101_True' 'cat__missing_V2180_True'
 'cat__missing_V2164_True' 'cat__missing_V2191_True'
 'cat__missing_V2195_True' 'cat__missing_V2155_True'
 'cat__missing_V2196_True' 'cat__missing_V2189_True'
 'cat__missing_V2179_True' 'cat__missing_V2143_True'
 'cat__missing_V2134_True' 'cat__missing_V2172_True'
 'cat__missing_V2137_True' 'cat__missing_V2140_True'
 'cat__missing_V2105_True' 'cat__missing_V2157_True'
 'cat__missing_V2183_True' 'cat__missing_V2187_True'
 'cat__missing_V2181_True' 'cat__missing_V2152_True'
 'cat__missing_V2153_True' 'cat__missing_V2156_True']
Aggregated Interaction DataFrame:
                    V2137     V2172     V2181     V2178         V2134  \
V2137       3.290348e-03  0.000058  0.000057  0.000046  1.228783e-05   
V2172       5.801001e-05  0.036453  0.001411  0.000912  2.559703e-04   
V2181       5.727996e-05  0.001411  0.015649  0.000689  2.132224e-05   
V2178       4.601106e-05  0.000912  0.000689  0.017057  1.084229e-04   
V2134       1.228783e-05  0.000256  0.000021  0.000108  3.977924e-03   
V2163       1.619578e-04  0.002878  0.001778  0.002497  1.119420e-03   
V2197       4.423864e-05  0.001065  0.000123  0.000389  3.886163e-05   
V2188       2.309646e-04  0.000351  0.001489  0.001942  5.925815e-04   
V2191       1.115818e-04  0.002676  0.001730  0.001493  1.027248e-04   
V2155       3.853873e-07  0.000674  0.001473  0.000789  8.013086e-08   
V2128       1.045936e-04  0.000170  0.000046  0.000043  3.988419e-06   
V2105       2.061868e-04  0.012276  0.006536  0.021041  5.858707e-04   
V2175       5.298451e-05  0.002327  0.001250  0.001751  8.722402e-05   
V2185       1.022528e-05  0.000353  0.000503  0.000028  1.659341e-05   
V2153       3.865524e-05  0.001057  0.000200  0.000105  1.011352e-05   
V2194       3.518209e-04  0.002630  0.001592  0.001795  2.106592e-04   
V2183       4.126358e-04  0.002614  0.001698  0.001640  1.325228e-04   
V2143       6.160799e-05  0.000144  0.000035  0.000030  5.761200e-06   
V2184       1.179433e-04  0.010305  0.001919  0.002645  1.882302e-04   
V2460       4.623492e-06  0.000091  0.000040  0.000063  3.028373e-06   
race        4.579310e-05  0.000269  0.000887  0.001670  1.160390e-04   
V2907       0.000000e+00  0.000000  0.000000  0.000000  0.000000e+00   
V2494       0.000000e+00  0.000000  0.000000  0.000000  0.000000e+00   
RESPONDENT  6.074983e-05  0.000633  0.000610  0.000591  4.204937e-05   
V2164       4.390109e-04  0.003228  0.001492  0.003086  4.721385e-04   
V2146       0.000000e+00  0.000000  0.000000  0.000000  0.000000e+00   
V49         1.367812e-04  0.002162  0.000863  0.003248  7.521613e-05   
V2182       6.599708e-05  0.002539  0.000586  0.000529  8.014844e-05   
V13         1.001896e-04  0.002855  0.001863  0.003177  2.432611e-05   
V2152       2.515096e-04  0.002587  0.001081  0.001536  3.369002e-04   
V2176       9.971148e-05  0.003391  0.000948  0.003888  6.190503e-05   
V2196       6.279371e-05  0.002950  0.003280  0.002430  4.967960e-05   
V2187       7.196244e-07  0.000026  0.000208  0.004478  1.288283e-05   
V2173       1.458582e-04  0.001530  0.001926  0.005064  2.275831e-04   
V2108       4.532427e-05  0.000435  0.000469  0.001280  3.270080e-05   
V2033       0.000000e+00  0.000000  0.000000  0.000000  0.000000e+00   
V2177       5.123756e-05  0.003809  0.001393  0.001134  3.953874e-04   
V2030       0.000000e+00  0.000000  0.000000  0.000000  0.000000e+00   
V2171       3.025585e-06  0.000046  0.000031  0.000029  2.299128e-07   
V2119       0.000000e+00  0.000000  0.000000  0.000000  0.000000e+00   
V2908       0.000000e+00  0.000000  0.000000  0.000000  0.000000e+00   
V2195       6.728445e-05  0.002004  0.001184  0.000895  4.059375e-05   
V2116       2.368459e-04  0.004952  0.001921  0.002697  1.920520e-04   
V2180       5.100981e-05  0.002118  0.000804  0.001856  1.427523e-04   
V2186       1.284499e-06  0.000304  0.000145  0.000033  1.145650e-06   
V2166       2.849611e-04  0.007115  0.001056  0.002999  6.797044e-04   
V2140       2.244612e-06  0.000019  0.000003  0.000005  2.623857e-05   
V2156       1.591964e-05  0.000763  0.000048  0.000230  0.000000e+00   
V2189       1.809382e-05  0.000265  0.000143  0.000152  2.749115e-05   
V2201       2.057817e-05  0.000705  0.000431  0.000557  2.771747e-05   
V2169       0.000000e+00  0.000000  0.000000  0.000000  0.000000e+00   
V2122       0.000000e+00  0.000000  0.000000  0.000000  0.000000e+00   
missing     1.302991e-04  0.001753  0.001000  0.001227  1.443024e-04   
sex         4.855872e-05  0.002203  0.003122  0.001071  2.214620e-05   
V2125       6.945713e-06  0.000120  0.000078  0.000046  4.987354e-06   
V2179       8.244927e-05  0.005191  0.001297  0.006686  2.424108e-04   
V2193       1.410531e-04  0.001559  0.001805  0.006972  8.529841e-05   
V2101       6.977184e-05  0.000516  0.000538  0.000440  6.703475e-05   
wave        1.441623e-03  0.022204  0.012419  0.024816  9.281955e-04   
V2157       9.000518e-05  0.011629  0.000145  0.000514  4.520196e-05   

               V2163     V2197     V2188     V2191         V2155  ...  V2169  \
V2137       0.000162  0.000044  0.000231  0.000112  3.853873e-07  ...    0.0   
V2172       0.002878  0.001065  0.000351  0.002676  6.741730e-04  ...    0.0   
V2181       0.001778  0.000123  0.001489  0.001730  1.472853e-03  ...    0.0   
V2178       0.002497  0.000389  0.001942  0.001493  7.889872e-04  ...    0.0   
V2134       0.001119  0.000039  0.000593  0.000103  8.013086e-08  ...    0.0   
V2163       0.031062  0.000350  0.011878  0.006844  1.333255e-03  ...    0.0   
V2197       0.000350  0.012217  0.000122  0.001059  3.766781e-04  ...    0.0   
V2188       0.011878  0.000122  0.056294  0.001452  0.000000e+00  ...    0.0   
V2191       0.006844  0.001059  0.001452  0.062045  1.533827e-03  ...    0.0   
V2155       0.001333  0.000377  0.000000  0.001534  2.150288e-02  ...    0.0   
V2128       0.000125  0.000111  0.000123  0.000116  1.529010e-05  ...    0.0   
V2105       0.025624  0.002345  0.025587  0.022655  1.138207e-02  ...    0.0   
V2175       0.004807  0.000904  0.000683  0.003781  1.665315e-03  ...    0.0   
V2185       0.000664  0.000151  0.000085  0.000612  4.174384e-04  ...    0.0   
V2153       0.000943  0.000628  0.001532  0.001256  5.241064e-05  ...    0.0   
V2194       0.005244  0.000568  0.002509  0.005804  8.418942e-04  ...    0.0   
V2183       0.013637  0.001147  0.000961  0.004148  2.265506e-03  ...    0.0   
V2143       0.000097  0.000022  0.000074  0.000103  2.758811e-05  ...    0.0   
V2184       0.010678  0.000499  0.007488  0.007492  1.059992e-03  ...    0.0   
V2460       0.000074  0.000127  0.000000  0.000045  2.408255e-05  ...    0.0   
race        0.004662  0.001383  0.007512  0.002647  3.265106e-04  ...    0.0   
V2907       0.000000  0.000000  0.000000  0.000000  0.000000e+00  ...    0.0   
V2494       0.000000  0.000000  0.000000  0.000000  0.000000e+00  ...    0.0   
RESPONDENT  0.006159  0.000354  0.002969  0.002256  7.573764e-04  ...    0.0   
V2164       0.010029  0.003265  0.003100  0.009492  1.225573e-03  ...    0.0   
V2146       0.000000  0.000000  0.000000  0.000000  0.000000e+00  ...    0.0   
V49         0.009347  0.001981  0.006958  0.002406  1.532764e-03  ...    0.0   
V2182       0.004331  0.000264  0.000793  0.003398  1.252161e-03  ...    0.0   
V13         0.004877  0.004062  0.005611  0.011941  5.255729e-04  ...    0.0   
V2152       0.013849  0.001526  0.001883  0.016666  5.542620e-03  ...    0.0   
V2176       0.005450  0.007375  0.001302  0.006005  1.173518e-03  ...    0.0   
V2196       0.005582  0.003381  0.003380  0.011326  3.780921e-03  ...    0.0   
V2187       0.001510  0.000596  0.000147  0.001088  1.814104e-03  ...    0.0   
V2173       0.005793  0.001092  0.001874  0.005735  1.039251e-03  ...    0.0   
V2108       0.000918  0.000370  0.000438  0.000791  1.470156e-04  ...    0.0   
V2033       0.000000  0.000000  0.000000  0.000000  0.000000e+00  ...    0.0   
V2177       0.012957  0.000745  0.002374  0.005384  3.641342e-03  ...    0.0   
V2030       0.000000  0.000000  0.000000  0.000000  0.000000e+00  ...    0.0   
V2171       0.000028  0.000005  0.000149  0.000250  7.811983e-05  ...    0.0   
V2119       0.000000  0.000000  0.000000  0.000000  0.000000e+00  ...    0.0   
V2908       0.000000  0.000000  0.000000  0.000000  0.000000e+00  ...    0.0   
V2195       0.003358  0.000906  0.001512  0.003992  2.270690e-03  ...    0.0   
V2116       0.015932  0.002656  0.007450  0.017679  1.040673e-03  ...    0.0   
V2180       0.000963  0.001108  0.001647  0.003240  4.879539e-04  ...    0.0   
V2186       0.000373  0.000163  0.000411  0.001692  6.472437e-04  ...    0.0   
V2166       0.026191  0.001771  0.001285  0.008454  1.530930e-03  ...    0.0   
V2140       0.000014  0.000007  0.000003  0.000007  1.202832e-05  ...    0.0   
V2156       0.001183  0.000168  0.000033  0.000645  4.001194e-04  ...    0.0   
V2189       0.002986  0.000187  0.000415  0.002361  8.053417e-04  ...    0.0   
V2201       0.001523  0.000225  0.001064  0.001286  1.715401e-04  ...    0.0   
V2169       0.000000  0.000000  0.000000  0.000000  0.000000e+00  ...    0.0   
V2122       0.000000  0.000000  0.000000  0.000000  0.000000e+00  ...    0.0   
missing     0.004868  0.000800  0.001602  0.002177  5.546067e-04  ...    0.0   
sex         0.003147  0.000483  0.006887  0.001303  1.305297e-03  ...    0.0   
V2125       0.000083  0.000073  0.000033  0.000060  1.834481e-05  ...    0.0   
V2179       0.016220  0.002975  0.002036  0.009874  3.068118e-03  ...    0.0   
V2193       0.006381  0.000423  0.001087  0.005248  1.135447e-03  ...    0.0   
V2101       0.004093  0.001623  0.001868  0.002238  1.077280e-03  ...    0.0   
wave        0.041002  0.030261  0.012158  0.042805  1.236974e-02  ...    0.0   
V2157       0.014435  0.000607  0.000000  0.003810  6.681082e-04  ...    0.0   

            V2122   missing       sex         V2125     V2179     V2193  \
V2137         0.0  0.000130  0.000049  6.945713e-06  0.000082  0.000141   
V2172         0.0  0.001753  0.002203  1.202531e-04  0.005191  0.001559   
V2181         0.0  0.001000  0.003122  7.846045e-05  0.001297  0.001805   
V2178         0.0  0.001227  0.001071  4.618155e-05  0.006686  0.006972   
V2134         0.0  0.000144  0.000022  4.987354e-06  0.000242  0.000085   
V2163         0.0  0.004868  0.003147  8.287642e-05  0.016220  0.006381   
V2197         0.0  0.000800  0.000483  7.336517e-05  0.002975  0.000423   
V2188         0.0  0.001602  0.006887  3.296022e-05  0.002036  0.001087   
V2191         0.0  0.002177  0.001303  5.978814e-05  0.009874  0.005248   
V2155         0.0  0.000555  0.001305  1.834481e-05  0.003068  0.001135   
V2128         0.0  0.000166  0.000235  5.278031e-05  0.000189  0.000052   
V2105         0.0  0.011739  0.033367  1.155057e-04  0.063313  0.010649   
V2175         0.0  0.002553  0.004833  9.044859e-05  0.015161  0.006784   
V2185         0.0  0.000344  0.000066  4.023579e-06  0.001705  0.000874   
V2153         0.0  0.001123  0.000627  2.541190e-05  0.002273  0.001496   
V2194         0.0  0.002683  0.010773  2.117903e-04  0.008950  0.002081   
V2183         0.0  0.004035  0.001675  9.323043e-05  0.018317  0.007331   
V2143         0.0  0.000110  0.000154  1.070003e-05  0.000069  0.000230   
V2184         0.0  0.002421  0.002722  1.764218e-04  0.003710  0.001853   
V2460         0.0  0.000169  0.000033  3.036354e-06  0.000145  0.000070   
race          0.0  0.002807  0.011958  1.800773e-04  0.005492  0.005691   
V2907         0.0  0.000000  0.000000  0.000000e+00  0.000000  0.000000   
V2494         0.0  0.000000  0.000000  0.000000e+00  0.000000  0.000000   
RESPONDENT    0.0  0.001436  0.000693  3.752440e-06  0.004736  0.000632   
V2164         0.0  0.003845  0.003471  1.827520e-04  0.014087  0.025429   
V2146         0.0  0.000000  0.000000  0.000000e+00  0.000000  0.000000   
V49           0.0  0.002933  0.005475  8.508893e-05  0.007755  0.004193   
V2182         0.0  0.003147  0.002015  7.989134e-05  0.007934  0.002508   
V13           0.0  0.002522  0.004142  3.941463e-05  0.015365  0.003531   
V2152         0.0  0.003885  0.018970  8.081511e-05  0.017696  0.005216   
V2176         0.0  0.002552  0.002989  2.218161e-05  0.007453  0.008351   
V2196         0.0  0.002644  0.006379  1.216230e-04  0.012311  0.015221   
V2187         0.0  0.000481  0.001689  8.803605e-06  0.002079  0.000689   
V2173         0.0  0.003842  0.009092  9.267340e-05  0.016540  0.006551   
V2108         0.0  0.000744  0.000395  7.392260e-05  0.001142  0.001002   
V2033         0.0  0.000000  0.000000  0.000000e+00  0.000000  0.000000   
V2177         0.0  0.003019  0.001743  2.240288e-04  0.012049  0.005962   
V2030         0.0  0.000000  0.000000  0.000000e+00  0.000000  0.000000   
V2171         0.0  0.000151  0.000010  6.784830e-08  0.000110  0.000179   
V2119         0.0  0.000000  0.000000  0.000000e+00  0.000000  0.000000   
V2908         0.0  0.000000  0.000000  0.000000e+00  0.000000  0.000000   
V2195         0.0  0.001795  0.003817  8.267769e-05  0.005848  0.004778   
V2116         0.0  0.007601  0.013019  4.775995e-04  0.017196  0.006381   
V2180         0.0  0.002693  0.002132  2.964032e-05  0.007085  0.001735   
V2186         0.0  0.000395  0.000938  2.823976e-06  0.001776  0.000801   
V2166         0.0  0.006739  0.005281  8.856284e-05  0.011620  0.006439   
V2140         0.0  0.000137  0.000004  4.850142e-06  0.000063  0.000022   
V2156         0.0  0.000538  0.000104  9.452533e-06  0.000524  0.000160   
V2189         0.0  0.002018  0.002574  2.279768e-05  0.001818  0.002545   
V2201         0.0  0.000851  0.000582  1.916033e-05  0.005652  0.001051   
V2169         0.0  0.000000  0.000000  0.000000e+00  0.000000  0.000000   
V2122         0.0  0.000000  0.000000  0.000000e+00  0.000000  0.000000   
missing       0.0  0.063807  0.002482  2.820202e-04  0.004974  0.002075   
sex           0.0  0.002482  0.222048  1.229450e-04  0.012901  0.003316   
V2125         0.0  0.000282  0.000123  2.467815e-03  0.000092  0.000045   
V2179         0.0  0.004974  0.012901  9.169292e-05  0.113295  0.005443   
V2193         0.0  0.002075  0.003316  4.488613e-05  0.005443  0.116065   
V2101         0.0  0.001910  0.001581  4.439017e-05  0.005998  0.000927   
wave          0.0  0.033823  0.063628  1.027013e-03  0.120772  0.042560   
V2157         0.0  0.003805  0.001879  1.753067e-05  0.009146  0.003135   

               V2101      wave         V2157  
V2137       0.000070  0.001442  9.000518e-05  
V2172       0.000516  0.022204  1.162857e-02  
V2181       0.000538  0.012419  1.452850e-04  
V2178       0.000440  0.024816  5.143092e-04  
V2134       0.000067  0.000928  4.520196e-05  
V2163       0.004093  0.041002  1.443499e-02  
V2197       0.001623  0.030261  6.065777e-04  
V2188       0.001868  0.012158  0.000000e+00  
V2191       0.002238  0.042805  3.810327e-03  
V2155       0.001077  0.012370  6.681082e-04  
V2128       0.000178  0.004767  5.427367e-05  
V2105       0.010748  1.136753  1.513988e-02  
V2175       0.001713  0.051602  3.533106e-03  
V2185       0.000156  0.004576  3.571820e-04  
V2153       0.000362  0.007102  2.383781e-03  
V2194       0.002464  0.104285  2.547308e-03  
V2183       0.003693  0.070119  3.732516e-03  
V2143       0.000101  0.001867  5.224842e-05  
V2184       0.003229  0.064718  3.160546e-03  
V2460       0.000047  0.000748  1.694982e-04  
race        0.004120  0.323612  3.675930e-03  
V2907       0.000000  0.000000  0.000000e+00  
V2494       0.000000  0.000000  0.000000e+00  
RESPONDENT  0.002167  0.028003  2.799535e-04  
V2164       0.002443  0.066317  4.266689e-03  
V2146       0.000000  0.000000  0.000000e+00  
V49         0.002219  0.023401  1.026972e-02  
V2182       0.001238  0.021054  2.189647e-03  
V13         0.003105  0.058597  3.486217e-03  
V2152       0.003383  0.099763  8.783717e-03  
V2176       0.001735  0.096700  1.425147e-02  
V2196       0.004824  0.218511  4.931574e-03  
V2187       0.000373  0.004686  2.159413e-04  
V2173       0.004344  0.092705  4.704848e-03  
V2108       0.001348  0.089846  1.385032e-03  
V2033       0.000000  0.000000  0.000000e+00  
V2177       0.004121  0.077873  9.695057e-03  
V2030       0.000000  0.000000  0.000000e+00  
V2171       0.000040  0.000628  2.726120e-04  
V2119       0.000000  0.000000  0.000000e+00  
V2908       0.000000  0.000000  0.000000e+00  
V2195       0.000828  0.042655  1.049865e-03  
V2116       0.011746  1.455743  1.703481e-02  
V2180       0.001427  0.020008  2.303820e-03  
V2186       0.000076  0.008708  9.195180e-04  
V2166       0.004499  0.068395  3.037345e-02  
V2140       0.000010  0.000188  6.438142e-07  
V2156       0.000282  0.003137  6.100717e-04  
V2189       0.000370  0.010539  1.024734e-03  
V2201       0.000502  0.038261  1.902385e-03  
V2169       0.000000  0.000000  0.000000e+00  
V2122       0.000000  0.000000  0.000000e+00  
missing     0.001910  0.033823  3.805279e-03  
sex         0.001581  0.063628  1.879282e-03  
V2125       0.000044  0.001027  1.753067e-05  
V2179       0.005998  0.120772  9.146010e-03  
V2193       0.000927  0.042560  3.134862e-03  
V2101       0.103005  0.317949  6.550920e-04  
wave        0.317949  5.987740  1.436741e-02  
V2157       0.000655  0.014367  1.050268e-01  

[60 rows x 60 columns]
In [ ]:
# Step 1: Store interactions and their values, avoiding self-interactions AND duplicates
interaction_results = []

for i, feature_i in enumerate(original_categorical_features):
    # Only loop j from i+1 to end, ensuring j > i
    for j in range(i + 1, len(original_categorical_features)):
        feature_j = original_categorical_features[j]

        # Get indices for feature_i and feature_j
        indices_i = [
            feature_names.tolist().index(col) 
            for col in feature_mapping[feature_i] 
            if col in feature_names.tolist()
        ]
        indices_j = [
            feature_names.tolist().index(col) 
            for col in feature_mapping[feature_j] 
            if col in feature_names.tolist()
        ]

        if not indices_i or not indices_j:
            continue

        # Compute interaction value
        value = np.sum(np.abs(interaction_values[:, indices_i, :][:, :, indices_j]))
        interaction_results.append(((feature_i, feature_j), value))

# Step 2: Sort the interactions by their absolute value (descending)
sorted_interactions = sorted(interaction_results, key=lambda x: x[1], reverse=True)

# Step 3: Select the top 30 interactions
top_30_interactions = sorted_interactions[:30]

# Step 4: Display the results
print("Top 30 Feature Interactions (Excluding Self-Interactions & Duplicates):")
for (feature_pair, interaction_value) in top_30_interactions:
    print(f"Interaction ({feature_pair[0]}, {feature_pair[1]}): {interaction_value}")

# Step 5 (Optional): Visualize the top 30 interactions
import matplotlib.pyplot as plt

# Extract feature pairs and their values
feature_pairs = [f"{pair[0]} & {pair[1]}" for pair, _ in top_30_interactions]
values = [value for _, value in top_30_interactions]

# Create a bar plot
plt.figure(figsize=(12, 8))
plt.barh(feature_pairs, values, color='skyblue')
plt.xlabel('Interaction Value')
plt.ylabel('Feature Pairs')
plt.title('Top 30 Feature Interactions (Excluding Duplicates)')
plt.gca().invert_yaxis()  # Invert y-axis for better readability
plt.tight_layout()
plt.show()
Top 30 Feature Interactions (Excluding Self-Interactions & Duplicates):
Interaction (V2116, wave): 1.455742984648874
Interaction (V2105, wave): 1.136753219294689
Interaction (race, wave): 0.32361201148137897
Interaction (V2101, wave): 0.31794924169861044
Interaction (V2196, wave): 0.21851062034184396
Interaction (V2179, wave): 0.12077151708295893
Interaction (V2194, wave): 0.10428466907697606
Interaction (V2152, wave): 0.09976265557869676
Interaction (V2176, wave): 0.09669982811555715
Interaction (V2173, wave): 0.09270483934866512
Interaction (V2108, wave): 0.08984579836600001
Interaction (V2177, wave): 0.07787268823469257
Interaction (V2183, wave): 0.07011925071180561
Interaction (V2105, V13): 0.0697534793138348
Interaction (V2166, wave): 0.06839478060713346
Interaction (V2164, wave): 0.06631673096471599
Interaction (V2184, wave): 0.06471843982411986
Interaction (sex, wave): 0.06362835383535415
Interaction (V2105, V2179): 0.06331328790635543
Interaction (V13, wave): 0.05859665738852144
Interaction (V2105, V2116): 0.05634949443254901
Interaction (V2105, V2152): 0.05585307824196819
Interaction (V2175, wave): 0.05160217708296773
Interaction (V2105, V2196): 0.04333985201715526
Interaction (V2191, wave): 0.04280474158118715
Interaction (V2195, wave): 0.0426549832333175
Interaction (V2193, wave): 0.04255965855948828
Interaction (V2176, V2116): 0.041027333765607917
Interaction (V2163, wave): 0.04100182534246335
Interaction (V2201, wave): 0.03826080205088958
No description has been provided for this image